Skip to content
Snippets Groups Projects
Commit 24abc819 authored by Thomas Edwards
Browse files

demo branch

parent 03303cee
No related branches found
No related tags found
1 merge request: !1 Demo branch
No preview for this file type
......@@ -7,11 +7,25 @@ from nltk.metrics import TrigramAssocMeasures
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from db.db_config import get_db
from googletrans import Translator
import asyncio
def run_collocation_on_text(page):
datasetid = page.split('><p>')[0].replace('<div id=', '').replace('"', '').strip()
print('dataset id in run_collocation_on_text: ',datasetid)
translator = Translator()
datasetid = page.split('__')[0]
inputstring = page.split('__')[1]
#detectlanguage = asyncio.run(translator.detect(inputstring))
#print('detected language: ',detectlanguage)
inputstringCh = asyncio.run(translator.translate(inputstring, src='en', dest='zh-cn')).text.strip()
#inputstringCh = translatech(inputstring).text.strip()
print('inputstring original: ',inputstring)
print('inputstringCh translated: ',inputstringCh)
print('-----------------------------')
#datasetid = page.split('><p>')[0].replace('<div id=', '').replace('"', '').strip()
#print('dataset id in run_collocation_on_text: ',datasetid)
collocations = []
nlp = spacy.load('zh_core_web_sm')
......@@ -61,8 +75,11 @@ def run_collocation_on_text(page):
#allscores = scoredbigrams+scoretrigrams
for item in scoredbigrams:
itemstr = " ".join(i for i in item[0])
if '' in itemstr:
itemstrnew = itemstr.replace('','').strip().replace(' ','')
#if '部' in itemstr:
if inputstringCh in itemstr:
#itemstrnew = itemstr.replace('部','').strip().replace(' ','')
itemstrnew = itemstr.replace(inputstringCh, '').strip().replace(' ', '')
#print('itemstrnew: ',itemstrnew)
#translation = translate(itemstr.replace('部','').strip()).text.lower()
#print(translation)
#print('--------------')
......
......@@ -5,10 +5,12 @@ from spacy.matcher import PhraseMatcher
from db.db_config import get_db
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from googletrans import Translator
import asyncio
def collocations(datasetid):
def collocations(datasetid,word):
collocations = []
nlp = spacy.load('zh_core_web_sm')
......@@ -49,7 +51,8 @@ def collocations(datasetid):
# allscores = scoredbigrams+scoretrigrams
for item in scoredbigrams:
itemstr = " ".join(i for i in item[0])
if '' in itemstr:
#if '部' in itemstr:
if word in itemstr:
itemstrnew = itemstr
#translation = translate(itemstr).text.lower()
# print(translation)
......@@ -66,8 +69,14 @@ def collocations(datasetid):
def run_concordance_on_text(page):
datasetid = page.replace("<p>Collocations for the word '' (department) for ",'').replace('</p>','').strip()
translator = Translator()
datasetid = page.replace('<p>Collocations for','').replace('for dataset','').replace('</p>','').split()[1].strip()
word = page.replace('<p>Collocations for','').replace('for dataset','').split()[0].strip()
wordch = asyncio.run(translator.translate(word, src='en', dest='zh-cn')).text.strip()
print('datasetid inside run_concordance_on_text: ',datasetid)
print('word inside run_concordance_on_text: ', wordch)
#page = page+'部'
nlp = spacy.load('zh_core_web_sm')
conn, cursor = get_db()
......@@ -81,7 +90,7 @@ def run_concordance_on_text(page):
data.append([docid, content])
concordances = []
terms = collocations(datasetid)
terms = collocations(datasetid,wordch)
#terms = [page]
for i in range(0, len(data)):
......
......@@ -2,6 +2,7 @@ import html_to_json
from shared.translate import translate
# Translate text
def run_translation_on_text(page):
#print('page from translation.py: ',page)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment