Commit 954fa329 authored by Thomas Edwards

build for DG

parent f23a52a7
 import spacy
 import math
-from shared.translate import translate
+#from shared.translate import translate
 import nltk
 from nltk.collocations import TrigramCollocationFinder
 from nltk.metrics import TrigramAssocMeasures
@@ -60,7 +60,7 @@ def run_collocation_on_text(page):
         itemstr = " ".join(i for i in item[0])
         if '部' in itemstr:
             itemstrnew = itemstr.replace('部','').strip().replace(' ','')
-            translation = translate(itemstr.replace('部','').strip()).text.lower()
+            #translation = translate(itemstr.replace('部','').strip()).text.lower()
             #print(translation)
             #print('--------------')
             score = round(item[1],3)
...
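The hunk above keeps the trigram scoring but disables the live translation lookup. For reference, a minimal self-contained sketch of the NLTK trigram scoring that produces the (itemstr, score) pairs; the likelihood-ratio measure and the token list are illustrative assumptions, since the diff does not show which association measure or token source the file uses:

import nltk
from nltk.collocations import TrigramCollocationFinder
from nltk.metrics import TrigramAssocMeasures

# Illustrative token list; in the file the tokens come from the news corpus.
tokens = ['尼罗河', '是', '一条', '河流', '尼罗河', '是', '一条', '大', '河流']

finder = TrigramCollocationFinder.from_words(tokens)
measures = TrigramAssocMeasures()

# score_ngrams yields ((w1, w2, w3), score) pairs, matching item[0] / item[1] above.
for item in finder.score_ngrams(measures.likelihood_ratio):
    itemstr = " ".join(i for i in item[0])
    score = round(item[1], 3)
    print(itemstr, score)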
 import spacy
-from shared.translate import translate
+#from shared.translate import translate
 from wasabi import Printer
 from spacy.matcher import PhraseMatcher
 from db.db_config import get_db
@@ -50,7 +50,7 @@ def collocations():
         itemstr = " ".join(i for i in item[0])
         if '部' in itemstr:
             itemstrnew = itemstr
-            translation = translate(itemstr).text.lower()
+            #translation = translate(itemstr).text.lower()
             # print(translation)
             # print('--------------')
             score = round(item[1], 3)
@@ -65,6 +65,8 @@ def collocations():
 def run_concordance_on_text(page):
+    #print('page: ',page)
+    page = page+''
     nlp = spacy.load('zh_core_web_sm')
     conn, cursor = get_db()
     cursor.execute('SELECT * from news;')
@@ -77,10 +79,13 @@ def run_concordance_on_text(page):
     concordances = []
     terms = collocations()
+    #terms = [page]
     for i in range(0, len(data)):
         id = data[i][0]
         txt = data[i][1]
         doc = nlp(txt)
+        #print('txt: ',txt)
         matcher = PhraseMatcher(nlp.vocab,attr='LOWER')
@@ -88,7 +93,7 @@ def run_concordance_on_text(page):
         matcher.add("TermCollocations", patterns)
         matches = matcher(doc)
-        match = Printer()
+        #match = Printer()
         for j, start, end in matches:
             perecedingSlice = doc[start - 20: start].text
             if '' in perecedingSlice:
@@ -97,10 +102,13 @@ def run_concordance_on_text(page):
                 perecedingSlice = perecedingSlice.strip()
             #perecedingSliceTr = clean(translate(doc[start - 20: start]).text)
             matchedTerm = doc[start:end].text
             #matchedTerm = doc[start:end].text
-            matchedTermTr = match.text(translate(doc[start:end].text).text, color='red', no_print=True)
+            #matchedTermTr = match.text(translate(doc[start:end].text).text, color='red', no_print=True)
             #matchedTermTr = match.text(translate(doc[start:end].text).text)
             followingSlice = doc[end:end + 20].text
             #followingSliceTr = clean(translate(doc[end:end + 20]).text)
@@ -109,6 +117,7 @@ def run_concordance_on_text(page):
             #contextTr = perecedingSliceTr+', '+matchedTermTr+', '+followingSliceTr
             #concordances.append({"0 Term": escapeAnscii(matchedTerm), "1 Eng": escapeAnscii(matchedTermTr), "2 Context":escapeAnscii(context), "3 Context Eng":escapeAnscii(contextTr)})
             concordances.append({"0 Preceded By":perecedingSlice,"1 Term": matchedTerm, "2 Followed By": followingSlice})
...
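For reference, a minimal sketch of the PhraseMatcher concordance loop above, run on a single inline sentence instead of the news table. The term list and sentence are illustrative, the pattern construction via nlp.make_doc is an assumption (the patterns line sits outside the shown hunks), and the 20-token windows mirror the doc[start - 20: start] and doc[end:end + 20] slices:

import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.load('zh_core_web_sm')  # requires the zh_core_web_sm model to be installed
terms = ['尼罗河']                   # illustrative; collocations() supplies these in the file
doc = nlp('尼罗河是一条流經非洲東部與北部的河流。')

matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
patterns = [nlp.make_doc(term) for term in terms]
matcher.add("TermCollocations", patterns)

concordances = []
for match_id, start, end in matcher(doc):
    precedingSlice = doc[max(start - 20, 0):start].text  # clamp at 0 to avoid negative indexing
    matchedTerm = doc[start:end].text
    followingSlice = doc[end:end + 20].text
    concordances.append({"0 Preceded By": precedingSlice, "1 Term": matchedTerm, "2 Followed By": followingSlice})
print(concordances)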
@@ -9,6 +9,7 @@ from db.db_config import get_db
 #page = '尼罗河 是一条流經非洲東部與北部的河流,與中非地區的剛果河、非洲南部的赞比西河以及西非地区的尼日尔河並列非洲最大的四個河流系統。'
 # Perform NER on Text
 def run_ner_on_text(page):
+    print('NER tag: ',page)
     ner_driver = CkipNerChunker(model="bert-base")
     conn, cursor = get_db()
     cursor.execute('SELECT * from news;')
@@ -41,7 +42,7 @@ def run_ner_on_text(page):
         word = tag.split('__')[0]
         ner = tag.split('__')[1]
         #translation = translate(word).text
-        if ner == 'PERSON':
+        if ner == page:
             ner_words_with_count.append({"0 Word": word, "1 NER": ner, "2 Frequency": freq})
             seen_words.append(tag)
...
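The second hunk above generalises the filter from a hard-coded 'PERSON' to whatever label arrives in page, so one endpoint can serve any entity type. A minimal sketch of that parameterised filter with ckip-transformers, on an illustrative sentence rather than the news table:

from ckip_transformers.nlp import CkipNerChunker

def ner_words_for_label(texts, label):
    ner_driver = CkipNerChunker(model="bert-base")
    # The driver returns one list of NerToken objects per input text.
    results = ner_driver(texts)
    words = []
    for sentence in results:
        for token in sentence:
            if token.ner == label:      # the `ner == page` comparison above
                words.append(token.word)
    return words

# Illustrative call; 'GPE' is one of the labels the model emits alongside 'PERSON'.
print(ner_words_for_label(['尼罗河流經非洲東部與北部。'], 'GPE'))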
@@ -2,8 +2,9 @@ import html_to_json
 from shared.translate import translate
-# Perform NER on Text
+# Translate text
 def run_translation_on_text(page):
+    #print('page from translation.py: ',page)
     try:
         output_json = html_to_json.convert_tables(page)
         output = output_json[0]
...
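The corrected comment now matches what the function does. A minimal sketch of the table-extraction step it begins with, using html_to_json's convert_tables on an illustrative HTML snippet; convert_tables returns one entry per <table>, which is why the hunk takes output_json[0]:

import html_to_json

page = """
<table>
  <tr><th>word</th><th>freq</th></tr>
  <tr><td>尼罗河</td><td>3</td></tr>
</table>
"""
output_json = html_to_json.convert_tables(page)
output = output_json[0]   # first (and here only) table
print(output)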
@@ -7,6 +7,7 @@ from db.db_config import get_db
 def run_usasFine_on_text(page):
+    print('tag: ',page)
     d = {}
     with open('/Users/tom/PycharmProjects/cognistance/func/usas/usas_desc.txt') as f:
         for line in f:
@@ -15,7 +16,7 @@ def run_usasFine_on_text(page):
             val = lineL[1].strip()
             d[key] = val
-    print(d)
+    #print(d)
     # We exclude the following components as we do not need them.
     nlp = spacy.load('zh_core_web_sm', exclude=['parser', 'ner'])
     # Load the Chinese PyMUSAS rule-based tagger in a separate spaCy pipeline
@@ -65,7 +66,7 @@ def run_usasFine_on_text(page):
         word = tag.split('__')[0]
         usas = tag.split('__')[1]
-        if 'A' in usas:
+        if page in usas:
             tag_object = {"0 Word":word,"1 Discourse Field": usas, "2 Definition":d[usas],"3 Frequency": freq}
             usas_tags_with_count.append(tag_object)
...
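As with the NER change, the last hunk swaps the hard-coded 'A' discourse-field prefix for the caller-supplied page value. A minimal sketch of that tag-filtering step, with illustrative word__tag pairs and definitions standing in for the PyMUSAS pipeline output and usas_desc.txt:

from collections import Counter

# Illustrative stand-ins; in the file these come from PyMUSAS and usas_desc.txt.
tags = ['尼罗河__Z2', '河流__W3', '是__A3', '是__A3']
d = {'A3': 'Being', 'W3': 'Geographical terms', 'Z2': 'Geographical names'}
page = 'A'   # the requested discourse-field prefix

usas_tags_with_count = []
for tag, freq in Counter(tags).items():
    word = tag.split('__')[0]
    usas = tag.split('__')[1]
    if page in usas:   # the parameterised `page in usas` test above
        usas_tags_with_count.append(
            {"0 Word": word, "1 Discourse Field": usas, "2 Definition": d[usas], "3 Frequency": freq})
print(usas_tags_with_count)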
@@ -28,7 +28,7 @@ def ner():
     return result
-@app.route("/translate", methods=['POST'])
+@app.route("/translation", methods=['POST'])
 def translate():
     request_data = request.get_json()
...
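Renaming the route means existing clients posting to /translate will now get a 404; they must target /translation instead. A minimal client sketch, assuming the Flask app runs on the default local port and that the handler expects a JSON body (the 'page' field name is an assumption; the hunk only shows request.get_json()):

import requests

# Hypothetical local endpoint; adjust host/port to the deployment.
resp = requests.post('http://127.0.0.1:5000/translation',
                     json={'page': '尼罗河是一条河流。'})
print(resp.status_code, resp.text)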
@@ -5,4 +5,4 @@ ckip-transformers~=0.3.4
 flask-cors~=4.0.1
 spacy~=3.7.4
 googletrans ~=3.1.0a0
-pandas ~=2.2.3
+pandas ~=2.2.
\ No newline at end of file
@@ -10,6 +10,7 @@ def translate(word):
     translator = Translator()
     result = translator.translate(word, src='zh-cn', dest='en')
+    print('page from translate.py: ', result)
     return result
...
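For reference, a minimal sketch of the googletrans call this helper wraps (googletrans ~=3.1.0a0, per requirements.txt above); translate() returns a result object whose .text attribute carries the English rendering, which is why callers elsewhere use translate(...).text:

from googletrans import Translator

translator = Translator()
result = translator.translate('尼罗河', src='zh-cn', dest='en')
print(result.text)   # expected to print something like "Nile"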