diff --git a/db/datasets.db b/db/datasets.db
index 8489e3980dc54d847378b5dff7902c70befe1462..cde875a2ec60e84f813b1fc79f999e817796427e 100644
Binary files a/db/datasets.db and b/db/datasets.db differ
diff --git a/func/collocation/collocation.py b/func/collocation/collocation.py
index c003be77e5f8a1a2a44a27d43898316836163449..185024aee60d5c7239e9a8419ff2388a87b38ced 100644
--- a/func/collocation/collocation.py
+++ b/func/collocation/collocation.py
@@ -7,11 +7,25 @@ from nltk.metrics import TrigramAssocMeasures
 from nltk.collocations import BigramCollocationFinder
 from nltk.metrics import BigramAssocMeasures
 from db.db_config import get_db
-
+from googletrans import Translator
+import asyncio
 
 def run_collocation_on_text(page):
-    datasetid = page.split('><p>')[0].replace('<div id=', '').replace('"', '').strip()
-    print('dataset id in run_collocation_on_text: ',datasetid)
+    translator = Translator()
+    datasetid = page.split('__')[0]
+    inputstring = page.split('__')[1]
+    #detectlanguage = asyncio.run(translator.detect(inputstring))
+    #print('detected language: ',detectlanguage)
+    inputstringCh = asyncio.run(translator.translate(inputstring, src='en', dest='zh-cn')).text.strip()
+    #inputstringCh = translatech(inputstring).text.strip()
+    print('inputstring original: ',inputstring)
+    print('inputstringCh translated: ',inputstringCh)
+    print('-----------------------------')
+
+    #datasetid = page.split('><p>')[0].replace('<div id=', '').replace('"', '').strip()
+    #print('dataset id in run_collocation_on_text: ',datasetid)
+
+
     collocations = []
 
     nlp = spacy.load('zh_core_web_sm')
@@ -61,8 +75,11 @@ def run_collocation_on_text(page):
         #allscores = scoredbigrams+scoretrigrams
         for item in scoredbigrams:
             itemstr = " ".join(i for i in item[0])
-            if '部' in itemstr:
-                itemstrnew = itemstr.replace('部','').strip().replace(' ','')
+            #if '部' in itemstr:
+            if inputstringCh in itemstr:
+                #itemstrnew = itemstr.replace('部','').strip().replace(' ','')
+                itemstrnew = itemstr.replace(inputstringCh, '').strip().replace(' ', '')
+                #print('itemstrnew: ',itemstrnew)
                 #translation = translate(itemstr.replace('部','').strip()).text.lower()
                 #print(translation)
                 #print('--------------')
diff --git a/func/concordance/concordance.py b/func/concordance/concordance.py
index 71e7c12b254d296e1dbdbcfe3a808bbbbfb4e7c1..166cfb5d969b082cca10186bd01d2db17e2f00da 100644
--- a/func/concordance/concordance.py
+++ b/func/concordance/concordance.py
@@ -5,10 +5,12 @@ from spacy.matcher import PhraseMatcher
 from db.db_config import get_db
 from nltk.collocations import BigramCollocationFinder
 from nltk.metrics import BigramAssocMeasures
+from googletrans import Translator
+import asyncio
+
 
-
-def collocations(datasetid):
+def collocations(datasetid,word):
     collocations = []
 
     nlp = spacy.load('zh_core_web_sm')
 
@@ -49,7 +51,8 @@ def collocations(datasetid):
         # allscores = scoredbigrams+scoretrigrams
         for item in scoredbigrams:
             itemstr = " ".join(i for i in item[0])
-            if '部' in itemstr:
+            #if '部' in itemstr:
+            if word in itemstr:
                 itemstrnew = itemstr
                 #translation = translate(itemstr).text.lower()
                 # print(translation)
@@ -66,8 +69,14 @@ def collocations(datasetid):
 
 
 def run_concordance_on_text(page):
-    datasetid = page.replace("<p>Collocations for the word '部' (department) for ",'').replace('</p>','').strip()
+    translator = Translator()
+    datasetid = page.replace('<p>Collocations for','').replace('for dataset','').replace('</p>','').split()[1].strip()
+    word = page.replace('<p>Collocations for','').replace('for dataset','').split()[0].strip()
+
+    wordch = asyncio.run(translator.translate(word, src='en', dest='zh-cn')).text.strip()
+    print('datasetid inside run_concordance_on_text: ',datasetid)
+    print('word inside run_concordance_on_text: ', wordch)
     #page = page+'部'
     nlp = spacy.load('zh_core_web_sm')
     conn, cursor = get_db()
 
@@ -81,7 +90,7 @@ def run_concordance_on_text(page):
         data.append([docid, content])
 
     concordances = []
-    terms = collocations(datasetid)
+    terms = collocations(datasetid,wordch)
     #terms = [page]
 
     for i in range(0, len(data)):
diff --git a/func/translation/translation.py b/func/translation/translation.py
index deeca7a8774700a8d05b1fdf5ecb23ff1c0d8d02..bf74b33e067912aa0fd5af6ff74822c67ad5a11c 100644
--- a/func/translation/translation.py
+++ b/func/translation/translation.py
@@ -2,6 +2,7 @@ import html_to_json
 from shared.translate import translate
 
 
+# Translate text
 def run_translation_on_text(page):
     #print('page from translation.py: ',page)