Commit f23a52a7 authored by Thomas Edwards

demo

parent df73f011
@@ -8,6 +8,7 @@
from func.collocation.collocation import *
from func.concordance.concordance import *
from func.mutlidatasets.multidatasets import *
from func.neroverall.neroverall import *
from func.usasFine.usasFine import *

# Perform NER on a file
# TAKES XML text page

@@ -44,6 +45,14 @@ def get_usas_for_data(page):
    return make_response(jsonify(result), 400)


def get_usasFine_for_data(page):
    result = run_usasFine_on_text(page)
    if result["code"] == "SUCCESS":
        return make_response(jsonify(result), 201)
    return make_response(jsonify(result), 400)


# Perform Sentiment analysis on a file
# TAKES XML text page
...
import spacy
import math
from shared.translate import translate
import nltk
from nltk.collocations import TrigramCollocationFinder
from nltk.metrics import TrigramAssocMeasures
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from db.db_config import get_db


def run_collocation_on_text(page):
    collocations = []
    nlp = spacy.load('zh_core_web_sm')
    conn, cursor = get_db()
    cursor.execute('SELECT * from news;')
    res = cursor.fetchall()
    data = []
    for row in res:
        docid = row[0]
        content = row[-1].replace('\n', ' ').replace('\t', ' ')
        data.append([docid, content])

    corpus = []
    for i in range(0, len(data)):
        id = data[i][0]
        txt = data[i][1]
        doc = nlp(txt)
        for token in doc:
            if not token.is_stop:
                # print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop)
                corpus.append(token.text.lower())

    biagram_collocation = BigramCollocationFinder.from_words(corpus)
    #biagram_collocation.apply_freq_filter(3)
    #trigram_collocation = TrigramCollocationFinder.from_words(corpus)
    #trigram_collocation.apply_freq_filter(3)
    scoredbigrams = biagram_collocation.score_ngrams(BigramAssocMeasures().likelihood_ratio)
    bigramterms = []
    for j in scoredbigrams:
        jstr = " ".join(b for b in j[0])
        bigramterms.append(jstr)
    #scoretrigrams = trigram_collocation.score_ngrams(TrigramAssocMeasures().likelihood_ratio)
    #allscores = scoredbigrams+scoretrigrams
    for item in scoredbigrams:
        itemstr = " ".join(i for i in item[0])
        if '' in itemstr:
            itemstrnew = itemstr.replace('','').strip().replace(' ','')
            translation = translate(itemstr.replace('','').strip()).text.lower()
            #print(translation)
            #print('--------------')
            score = round(item[1], 3)
            freq = bigramterms.count(itemstr) / 1000
            collocations.append({"0 Collocate": itemstrnew, "1 LogRatio": score, "2 Frequency": freq})
    collocationsorted = sorted(collocations, key=lambda x: (x["1 LogRatio"], x["2 Frequency"]), reverse=True)[:10]
    result = {'output': collocationsorted, 'message': 'Done', 'code': 'SUCCESS'}
    return result
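For reference, a minimal sketch of the NLTK bigram scoring used above, run on a toy token list instead of the spaCy-tokenised corpus; the token values are illustrative only.

from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

tokens = ['研发', '投入', '研发', '投入', '企业', '创新']  # hypothetical tokens
finder = BigramCollocationFinder.from_words(tokens)
# score_ngrams returns [((w1, w2), score), ...] sorted by score, highest first
scored = finder.score_ngrams(BigramAssocMeasures().likelihood_ratio)
for (w1, w2), score in scored[:3]:
    print(w1, w2, round(score, 3))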
import spacy
from shared.translate import translate
from wasabi import Printer
from spacy.matcher import PhraseMatcher
from db.db_config import get_db
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures


def collocations():
    collocations = []
    nlp = spacy.load('zh_core_web_sm')
    conn, cursor = get_db()
    cursor.execute('SELECT * from news;')
    res = cursor.fetchall()
    data = []
    for row in res:
        docid = row[0]
        content = row[-1].replace('\n', ' ').replace('\t', ' ')
        data.append([docid, content])

    corpus = []
    for i in range(0, len(data)):
        txt = data[i][1]
        doc = nlp(txt)
        for token in doc:
            if not token.is_stop:
                # print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop)
                corpus.append(token.text.lower())

    biagram_collocation = BigramCollocationFinder.from_words(corpus)
    scoredbigrams = biagram_collocation.score_ngrams(BigramAssocMeasures().likelihood_ratio)
    bigramterms = []
    for j in scoredbigrams:
        jstr = " ".join(b for b in j[0])
        bigramterms.append(jstr)
    # scoretrigrams = trigram_collocation.score_ngrams(TrigramAssocMeasures().likelihood_ratio)
    # allscores = scoredbigrams+scoretrigrams
    for item in scoredbigrams:
        itemstr = " ".join(i for i in item[0])
        if '' in itemstr:
            itemstrnew = itemstr
            translation = translate(itemstr).text.lower()
            # print(translation)
            # print('--------------')
            score = round(item[1], 3)
            freq = bigramterms.count(itemstr) / 1000
            collocations.append({"0 Collocate": itemstrnew, "1 LogRatio": score, "2 Frequency": freq})
    collocationsorted = sorted(collocations, key=lambda x: (x["1 LogRatio"], x["2 Frequency"]), reverse=True)[:10]
    terms = [d['0 Collocate'] for d in collocationsorted]
    return terms


def run_concordance_on_text(page):
    nlp = spacy.load('zh_core_web_sm')
    conn, cursor = get_db()
    cursor.execute('SELECT * from news;')
    res = cursor.fetchall()
    data = []
    for row in res:
        docid = row[0]
        content = row[-1].replace('\n', ' ').replace('\t', ' ')
        data.append([docid, content])

    concordances = []
    terms = collocations()
    for i in range(0, len(data)):
        id = data[i][0]
        txt = data[i][1]
        doc = nlp(txt)

        matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
        patterns = [nlp.make_doc(term) for term in terms]
        matcher.add("TermCollocations", patterns)

        matches = matcher(doc)
        match = Printer()
        for j, start, end in matches:
            perecedingSlice = doc[start - 20: start].text
            if '' in perecedingSlice:
                perecedingSlice = perecedingSlice.split('')[1]
            else:
                perecedingSlice = perecedingSlice.strip()

            #perecedingSliceTr = clean(translate(doc[start - 20: start]).text)
            matchedTerm = doc[start:end].text
            #matchedTerm = doc[start:end].text
            matchedTermTr = match.text(translate(doc[start:end].text).text, color='red', no_print=True)
            #matchedTermTr = match.text(translate(doc[start:end].text).text)
            followingSlice = doc[end:end + 20].text
            #followingSliceTr = clean(translate(doc[end:end + 20]).text)

            #context = perecedingSlice+', '+matchedTerm+', '+followingSlice
            #contextTr = perecedingSliceTr+', '+matchedTermTr+', '+followingSliceTr
            #concordances.append({"0 Term": escapeAnscii(matchedTerm), "1 Eng": escapeAnscii(matchedTermTr), "2 Context":escapeAnscii(context), "3 Context Eng":escapeAnscii(contextTr)})
            concordances.append({"0 Preceded By": perecedingSlice, "1 Term": matchedTerm, "2 Followed By": followingSlice})

    result = {'output': concordances, 'message': 'Done', 'code': 'SUCCESS'}
    return result

#def main():
#    result = run_concordance_on_text(page)
#main()
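A minimal sketch of the spaCy PhraseMatcher pattern used in run_concordance_on_text. It uses a blank English pipeline and an English phrase purely as a stand-in so it runs without the zh_core_web_sm model; the real code matches collocates returned by collocations() against each news document.

import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank('en')  # stand-in pipeline; the real code loads zh_core_web_sm
matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
matcher.add("TermCollocations", [nlp.make_doc("research funding")])  # hypothetical collocate

doc = nlp("The firm increased its Research Funding again this year")
for match_id, start, end in matcher(doc):
    # a 20-token window on each side gives the concordance context
    print(doc[max(start - 20, 0):start].text, '|', doc[start:end].text, '|', doc[end:end + 20].text)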
@@ -12,7 +12,7 @@ def run_multidatasets():
    data = []
    for row in res:
        print(row)
        data.append({"0 Title": row[1], "1 Date": row[4]})
...
@@ -2,42 +2,54 @@ import torch
from ckip_transformers.nlp import CkipWordSegmenter, CkipPosTagger, CkipNerChunker
from transformers import pipeline
import pandas as pd
from db.db_config import get_db
#from shared.translate import translate

#page = '尼罗河 是一条流經非洲東部與北部的河流,與中非地區的剛果河、非洲南部的赞比西河以及西非地区的尼日尔河並列非洲最大的四個河流系統。'

# Perform NER on Text
def run_ner_on_text(page):
    ner_driver = CkipNerChunker(model="bert-base")
    conn, cursor = get_db()
    cursor.execute('SELECT * from news;')
    res = cursor.fetchall()
    data = []
    for row in res:
        docid = row[0]
        content = row[-1].replace('\n', ' ').replace('\t', ' ')
        data.append([docid, content])

    ner_words_with_count = []
    for i in range(0, len(data)):
        id = data[i][0]
        txt = data[i][1]
        ner = ner_driver([txt])
        tags = []
        for item in ner[0]:
            word = item.word
            ner = item.ner
            tags.append(word + '__' + ner)

        seen_words = []
        for tag in tags:
            if tag not in seen_words:
                freq = tags.count(tag) / 1000
                word = tag.split('__')[0]
                ner = tag.split('__')[1]
                #translation = translate(word).text
                if ner == 'PERSON':
                    ner_words_with_count.append({"0 Word": word, "1 NER": ner, "2 Frequency": freq})
                seen_words.append(tag)

    perdoc = sorted(ner_words_with_count, key=lambda x: x["2 Frequency"], reverse=True)[:30]
    result = {'output': perdoc, 'message': 'Done', 'code': 'SUCCESS'}
    return result
@@ -47,3 +59,7 @@ def run_ner_on_text(page):
...
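The per-document counting in run_ner_on_text can be checked without loading the CKIP model; the (word, label) pairs below are hypothetical stand-ins for CkipNerChunker output, and the loop mirrors the dedup, PERSON filter, and 1/1000 scaling used above.

# Hypothetical stand-in for CkipNerChunker output on one document
doc_entities = [('吴景晖', 'PERSON'), ('宁波', 'GPE'), ('吴景晖', 'PERSON')]

tags = [word + '__' + label for word, label in doc_entities]
seen, rows = [], []
for tag in tags:
    if tag not in seen:
        word, label = tag.split('__')
        if label == 'PERSON':
            # raw count scaled by 1/1000, as in the frequency convention above
            rows.append({"0 Word": word, "1 NER": label, "2 Frequency": tags.count(tag) / 1000})
        seen.append(tag)

print(sorted(rows, key=lambda r: r["2 Frequency"], reverse=True))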
@@ -16,39 +16,42 @@ def run_neroverall_on_text(page):
        data.append([docid, content])

    ner_with_count = []
    tags = []
    ners = []
    seen_words = []
    seen_tags = []
    for i in range(0, len(data)):
        id = data[i][0]
        txt = data[i][1]
        ner = ner_driver([txt])
        for item in ner[0]:
            word = item.word
            ner = item.ner
            tags.append(word + '__' + ner)

    for tag in tags:
        if tag not in seen_words:
            ner = tag.split('__')[1].strip()
            ners.append(ner)
            seen_words.append(tag)

    for n in ners:
        if n not in seen_tags:
            freq = ners.count(n) / 1000
            ner_with_count.append({"1 NER": n, "2 Frequency": freq})
            seen_tags.append(n)

    nerall = sorted(ner_with_count, key=lambda x: x["2 Frequency"], reverse=True)
    result = {'output': nerall, 'message': 'Done', 'code': 'SUCCESS'}
    return result
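The final counting-and-sorting step over the collected labels could also be written with collections.Counter; a sketch with hypothetical labels and the same 1/1000 scaling, shown only to clarify that step.

from collections import Counter

# Hypothetical NER labels collected across all documents
ners = ['PERSON', 'GPE', 'PERSON', 'DATE', 'PERSON']
counts = Counter(ners)
nerall = sorted(
    ({"1 NER": n, "2 Frequency": c / 1000} for n, c in counts.items()),
    key=lambda x: x["2 Frequency"], reverse=True,
)
print(nerall)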
@@ -8,30 +8,53 @@ from transformers import (
from transformers import pipeline
import re
from db.db_config import get_db


def zng(paragraph):
    for sent in re.findall(u'[^!?。\.\!\?]+[!?。\.\!\?]?', paragraph, flags=re.U):
        yield sent


def run_sentiment_on_text(page):
    conn, cursor = get_db()
    cursor.execute('SELECT * from news;')
    res = cursor.fetchall()
    data = []
    sentiments = []
    for row in res:
        docid = row[0]
        content = row[-1].replace('\n', ' ').replace('\t', ' ')
        data.append([docid, content])

    allsentences = []
    for i in range(0, len(data)):
        txt = data[i][1]
        pagesList = list(zng(txt))
        allsentences.append(pagesList)
    allsentences = [x for xs in allsentences for x in xs]

    tokenizer = BertTokenizerFast.from_pretrained('bardsai/finance-sentiment-zh-base')
    model = AutoModelForSequenceClassification.from_pretrained('bardsai/finance-sentiment-zh-base')
    nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
    counts = dict()
    for p in allsentences:
        res = nlp(p)[0]['label']
        counts[res] = counts.get(res, 0) + 1
    if 'negative' not in counts.keys():
        counts['negative'] = 0
    for k in counts.keys():
        sentiments.append({"0 Sentiment": k, "1 Count": counts[k]})
    sentiments = sorted(sentiments, key=lambda x: x["1 Count"], reverse=True)
    result = {'output': sentiments, 'message': 'Done', 'code': 'SUCCESS'}
    return result
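The zng helper splits Chinese text into sentences by matching runs of non-terminator characters plus an optional terminator; a small standalone check of that regex, using sample text only.

import re

def zng(paragraph):
    # split on the sentence-ending punctuation in the character class, keeping the delimiter
    for sent in re.findall(u'[^!?。\.\!\?]+[!?。\.\!\?]?', paragraph, flags=re.U):
        yield sent

text = '创新是这类企业的灵魂。足够的研发投入是重要保障。'
print(list(zng(text)))
# expected: ['创新是这类企业的灵魂。', '足够的研发投入是重要保障。']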
import html_to_json
from shared.translate import translate


# Perform translation on text
def run_translation_on_text(page):
    try:
        output_json = html_to_json.convert_tables(page)
        output = output_json[0]
        translated = []

        for item in output:
            try:
                translated_item = {}  # Use a proper dictionary to hold key-value pairs
                for k, v in item.items():
                    # Translate text and store in dictionary
                    translated_item[k] = translate(v).text
                #time.sleep(1)  # Throttle API calls to avoid exhausting memory and network resources
                translated.append(translated_item)
            except Exception as e:
                # Log or handle translation errors and continue processing other items
                print(f"Error translating item {item}: {e}")

        result = {'output': translated, 'message': 'Done', 'code': 'SUCCESS'}
        return result
    except Exception as e:
        print(f"Error in run_translation_on_text: {e}")
        return {'output': None, 'message': f'Error: {e}', 'code': 'FAILED'}
\ No newline at end of file
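The per-row loop in run_translation_on_text is plain dict iteration once the HTML table has been parsed. A sketch under the assumption that each parsed row behaves like a dict; translate_stub is a hypothetical placeholder for shared.translate.translate, so this runs without the translation service.

# Stand-in for one parsed table: a list of row dicts (assumed shape)
rows = [{'公司': '宁波创润新材料有限公司', '投入': '2500万元'}]

def translate_stub(text):
    # placeholder for shared.translate.translate(text).text
    return f'<translated:{text}>'

translated = []
for item in rows:
    try:
        translated.append({k: translate_stub(v) for k, v in item.items()})
    except Exception as e:
        # skip rows that fail and keep going, as in run_translation_on_text
        print(f"Error translating item {item}: {e}")

print(translated)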
import spacy
from db.db_config import get_db

# Perform USAS on Text
@@ -7,7 +8,7 @@ import spacy
def run_usas_on_text(page):
    d = {}
    with open('/Users/tom/PycharmProjects/cognistance/func/usas/usas_overall.txt') as f:
        for line in f:
            lineL = line.replace('\n', '').split(' ', 1)
            key = lineL[0].strip()
@@ -22,35 +23,60 @@ def run_usas_on_text(page):
    # Adds the Chinese PyMUSAS rule-based tagger to the main spaCy pipeline
    nlp.add_pipe('pymusas_rule_based_tagger', source=chinese_tagger_pipeline)

    conn, cursor = get_db()
    cursor.execute('SELECT * from news;')
    res = cursor.fetchall()
    data = []
    for row in res:
        docid = row[0]
        content = row[-1].replace('\n', ' ').replace('\t', ' ')
        data.append([docid, content])

    #output_doc = nlp(page)
    usas_tags_with_count = []
    tags = []
    seen_tags = []
    for i in range(0, len(data)):
        id = data[i][0]
        txt = data[i][1]
        output_doc = nlp(txt)
        for token in output_doc:
            start, end = token._.pymusas_mwe_indexes[0]
            idx = (start, end)
            for el in token._.pymusas_tags:
                el = el.split('.')[0][0]
                #obj = {"word": token.text, "USAS Tags": el, "idx": idx}
                tags.append(el)
                #data.append(obj)

    for tag in tags:
        if tag not in seen_tags:
            try:
                freq = tags.count(tag)/1000
                usas_tags_with_count.append({"0 Tag": tag, "1 Definition": d[tag], "2 Frequency": freq})
            except KeyError:
                pass
            seen_tags.append(tag)

    usas_tags_with_count_sorted = sorted(usas_tags_with_count, key=lambda x: x["2 Frequency"], reverse=True)
    result = {'output': usas_tags_with_count_sorted, 'message': 'Done', 'code': 'SUCCESS'}
    return result
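run_usas_on_text now keeps only the leading letter of each PyMUSAS tag before looking it up in usas_overall.txt; a one-line check of the split('.')[0][0] convention on sample tag strings (the tag values are illustrative).

# Example PyMUSAS-style tag strings (illustrative)
raw_tags = ['A1.1.1', 'S7.1+', 'Z99']
top_level = [t.split('.')[0][0] for t in raw_tags]
print(top_level)  # expected: ['A', 'S', 'Z']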
import spacy
from db.db_config import get_db


# Perform USAS on Text
#page = '尼罗河 是一条流經非洲東部與北部的河流,與中非地區的剛果河、非洲南部的赞比西河以及西非地区的尼日尔河並列非洲最大的四個河流系統。'
def run_usasFine_on_text(page):
    d = {}
    with open('/Users/tom/PycharmProjects/cognistance/func/usas/usas_desc.txt') as f:
        for line in f:
            lineL = line.replace('\n', '').split(' ', 1)
            key = lineL[0].strip()
            val = lineL[1].strip()
            d[key] = val
    print(d)
    # We exclude the following components as we do not need them.
    nlp = spacy.load('zh_core_web_sm', exclude=['parser', 'ner'])
    # Load the Chinese PyMUSAS rule-based tagger in a separate spaCy pipeline
    chinese_tagger_pipeline = spacy.load('cmn_dual_upos2usas_contextual')
    # Adds the Chinese PyMUSAS rule-based tagger to the main spaCy pipeline
    nlp.add_pipe('pymusas_rule_based_tagger', source=chinese_tagger_pipeline)

    conn, cursor = get_db()
    cursor.execute('SELECT * from news;')
    res = cursor.fetchall()
    data = []
    for row in res:
        docid = row[0]
        content = row[-1].replace('\n', ' ').replace('\t', ' ')
        data.append([docid, content])

    #output_doc = nlp(page)
    usas_tags_with_count = []
    tags = []
    seen_tags = []
    for i in range(0, len(data)):
        id = data[i][0]
        txt = data[i][1]
        output_doc = nlp(txt)
        for token in output_doc:
            start, end = token._.pymusas_mwe_indexes[0]
            idx = (start, end)
            for el in token._.pymusas_tags:
                el = el.split('.')[0]
                #obj = {"word": token.text, "USAS Tags": el, "idx": idx}
                #tags.append(el)
                word = token.text
                tags.append(word + '__' + el)
                #data.append(obj)

    for tag in tags:
        if tag not in seen_tags:
            try:
                freq = tags.count(tag)/1000
                word = tag.split('__')[0]
                usas = tag.split('__')[1]
                if 'A' in usas:
                    tag_object = {"0 Word": word, "1 Discourse Field": usas, "2 Definition": d[usas], "3 Frequency": freq}
                    usas_tags_with_count.append(tag_object)
            except KeyError:
                pass
            seen_tags.append(tag)

    usas_tags_with_count_sorted = sorted(usas_tags_with_count, key=lambda x: x["3 Frequency"], reverse=False)[:30]
    result = {'output': usas_tags_with_count_sorted, 'message': 'Done', 'code': 'SUCCESS'}
    return result
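run_usasFine_on_text builds its definition lookup with split(' ', 1), which implies usas_desc.txt holds one tag code per line followed by its description after the first space. A sketch of that parsing on inline sample lines; the codes and descriptions here are illustrative stand-ins, not the real file contents.

# Illustrative stand-in for the usas_desc.txt format implied by split(' ', 1)
sample = [
    'A1.1.1 General actions / making',
    'A2.1 Affect: modify, change',
]
d = {}
for line in sample:
    key, val = line.replace('\n', '').split(' ', 1)
    d[key.strip()] = val.strip()
print(d['A2.1'])  # expected: 'Affect: modify, change'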
@@ -45,6 +45,14 @@ def usas():
    return result


@app.route("/usasFine", methods=['POST'])
def usasFine():
    request_data = request.get_json()
    page = request_data['page']
    result = get_usasFine_for_data(page)
    return result
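Assuming the Flask app runs on its default localhost:5000, the new endpoint can be exercised like the existing ones. The host, port, and payload value here are assumptions; only the /usasFine route and the 'page' key come from the code above.

import requests

# hypothetical local call; adjust host/port to wherever the app is served
resp = requests.post('http://localhost:5000/usasFine', json={'page': '<p>...</p>'})
print(resp.status_code, resp.json())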

@app.route("/sentiment", methods=['POST'])
def sentiment():
...