From 954fa329d630654a36a1ec11cf4c8b4d25ad2a4a Mon Sep 17 00:00:00 2001
From: Tom Edwards <edwardstj1@cardiff.ac.uk>
Date: Mon, 3 Feb 2025 18:36:18 +0000
Subject: [PATCH] build for DG

---
 func/collocation/collocation.py |  4 ++--
 func/concordance/concordance.py | 17 +++++++++++++----
 func/ner/ner.py                 |  3 ++-
 func/translation/translation.py |  3 ++-
 func/usasFine/usasFine.py       |  5 +++--
 main.py                         |  2 +-
 requirements.txt                |  2 +-
 shared/translate.py             |  1 +
 8 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/func/collocation/collocation.py b/func/collocation/collocation.py
index 6ca9db5..0709e97 100644
--- a/func/collocation/collocation.py
+++ b/func/collocation/collocation.py
@@ -1,6 +1,6 @@
 import spacy
 import math
-from shared.translate import translate
+#from shared.translate import translate
 import nltk
 from nltk.collocations import TrigramCollocationFinder
 from nltk.metrics import TrigramAssocMeasures
@@ -60,7 +60,7 @@ def run_collocation_on_text(page):
         itemstr = " ".join(i for i in item[0])
         if '部' in itemstr:
             itemstrnew = itemstr.replace('部','').strip().replace(' ','')
-            translation = translate(itemstr.replace('部','').strip()).text.lower()
+            #translation = translate(itemstr.replace('部','').strip()).text.lower()
             #print(translation)
             #print('--------------')
             score = round(item[1],3)
diff --git a/func/concordance/concordance.py b/func/concordance/concordance.py
index b9fe3fd..5a0fe24 100644
--- a/func/concordance/concordance.py
+++ b/func/concordance/concordance.py
@@ -1,5 +1,5 @@
 import spacy
-from shared.translate import translate
+#from shared.translate import translate
 from wasabi import Printer
 from spacy.matcher import PhraseMatcher
 from db.db_config import get_db
@@ -50,7 +50,7 @@ def collocations():
         itemstr = " ".join(i for i in item[0])
         if '部' in itemstr:
             itemstrnew = itemstr
-            translation = translate(itemstr).text.lower()
+            #translation = translate(itemstr).text.lower()
             # print(translation)
             # print('--------------')
             score = round(item[1], 3)
@@ -65,6 +65,8 @@ def collocations():


 def run_concordance_on_text(page):
+    #print('page: ',page)
+    page = page+'部'
     nlp = spacy.load('zh_core_web_sm')
     conn, cursor = get_db()
     cursor.execute('SELECT * from news;')
@@ -77,10 +79,13 @@ def run_concordance_on_text(page):
     concordances = []

     terms = collocations()
+
+    #terms = [page]
     for i in range(0, len(data)):
         id = data[i][0]
         txt = data[i][1]
         doc = nlp(txt)
+        #print('txt: ',txt)

         matcher = PhraseMatcher(nlp.vocab,attr='LOWER')

@@ -88,7 +93,7 @@ def run_concordance_on_text(page):
         matcher.add("TermCollocations", patterns)

         matches = matcher(doc)
-        match = Printer()
+        #match = Printer()
         for j, start, end in matches:
             perecedingSlice = doc[start - 20: start].text
             if '。' in perecedingSlice:
@@ -97,10 +102,13 @@ def run_concordance_on_text(page):

             perecedingSlice = perecedingSlice.strip()

+            #perecedingSliceTr = clean(translate(doc[start - 20: start]).text)
             matchedTerm = doc[start:end].text
+
+            #matchedTerm = doc[start:end].text

-            matchedTermTr = match.text(translate(doc[start:end].text).text, color='red', no_print=True)
+            #matchedTermTr = match.text(translate(doc[start:end].text).text, color='red', no_print=True)
             #matchedTermTr = match.text(translate(doc[start:end].text).text)
             followingSlice = doc[end:end + 20].text
             #followingSliceTr = clean(translate(doc[end:end + 20]).text)
@@ -109,6 +117,7 @@ def run_concordance_on_text(page):


             #contextTr = perecedingSliceTr+', '+matchedTermTr+', '+followingSliceTr
             #concordances.append({"0 Term": escapeAnscii(matchedTerm), "1 Eng": escapeAnscii(matchedTermTr), "2 Context":escapeAnscii(context), "3 Context Eng":escapeAnscii(contextTr)})
+
             concordances.append({"0 Preceded By":perecedingSlice,"1 Term": matchedTerm, "2 Followed By": followingSlice})
diff --git a/func/ner/ner.py b/func/ner/ner.py
index e783903..5fe9a5f 100644
--- a/func/ner/ner.py
+++ b/func/ner/ner.py
@@ -9,6 +9,7 @@ from db.db_config import get_db
 #page = '尼罗河 是一条流經非洲東部與北部的河流，與中非地區的剛果河、非洲南部的赞比西河以及西非地區的尼日尔河並列非洲最大的四個河流系統。'
 # Perform NER on Text
 def run_ner_on_text(page):
+    print('NER tag: ',page)
     ner_driver = CkipNerChunker(model="bert-base")
     conn, cursor = get_db()
     cursor.execute('SELECT * from news;')
@@ -41,7 +42,7 @@ def run_ner_on_text(page):

             word = tag.split('__')[0]
             ner = tag.split('__')[1]
             #translation = translate(word).text
-            if ner == 'PERSON':
+            if ner == page:
                 ner_words_with_count.append({"0 Word": word, "1 NER": ner, "2 Frequency": freq})
                 seen_words.append(tag)
diff --git a/func/translation/translation.py b/func/translation/translation.py
index 190bd6b..deeca7a 100644
--- a/func/translation/translation.py
+++ b/func/translation/translation.py
@@ -2,8 +2,9 @@
 import html_to_json
 from shared.translate import translate

-# Perform NER on Text
+# Translate text
 def run_translation_on_text(page):
+    #print('page from translation.py: ',page)
     try:
         output_json = html_to_json.convert_tables(page)
         output = output_json[0]
diff --git a/func/usasFine/usasFine.py b/func/usasFine/usasFine.py
index 5255519..349e373 100644
--- a/func/usasFine/usasFine.py
+++ b/func/usasFine/usasFine.py
@@ -7,6 +7,7 @@ from db.db_config import get_db


 def run_usasFine_on_text(page):
+    print('tag: ',page)
     d = {}
     with open('/Users/tom/PycharmProjects/cognistance/func/usas/usas_desc.txt') as f:
         for line in f:
@@ -15,7 +16,7 @@ def run_usasFine_on_text(page):
             val = lineL[1].strip()
             d[key] = val

-    print(d)
+    #print(d)
     # We exclude the following components as we do not need them.
     nlp = spacy.load('zh_core_web_sm', exclude=['parser', 'ner'])
     # Load the Chinese PyMUSAS rule-based tagger in a separate spaCy pipeline
@@ -65,7 +66,7 @@ def run_usasFine_on_text(page):

             word = tag.split('__')[0]
             usas = tag.split('__')[1]

-            if 'A' in usas:
+            if page in usas:
                 tag_object = {"0 Word":word,"1 Discourse Field": usas, "2 Definition":d[usas],"3 Frequency": freq}
                 usas_tags_with_count.append(tag_object)
diff --git a/main.py b/main.py
index ea0ba98..d576f06 100644
--- a/main.py
+++ b/main.py
@@ -28,7 +28,7 @@ def ner():
     return result


-@app.route("/translate", methods=['POST'])
+@app.route("/translation", methods=['POST'])
 def translate():

     request_data = request.get_json()
diff --git a/requirements.txt b/requirements.txt
index 11e587f..2c915fa 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,4 +5,4 @@ ckip-transformers~=0.3.4
 flask-cors~=4.0.1
 spacy~=3.7.4
 googletrans ~=3.1.0a0
-pandas ~=2.2.3
\ No newline at end of file
+pandas ~=2.2
\ No newline at end of file
diff --git a/shared/translate.py b/shared/translate.py
index d269dd1..c51c845 100644
--- a/shared/translate.py
+++ b/shared/translate.py
@@ -10,6 +10,7 @@ def translate(word):

     translator = Translator()

     result = translator.translate(word, src='zh-cn', dest='en')
+    print('page from translate.py: ', result)
     return result

--
GitLab