From f23a52a75b5f0118f56351dc26e062f645556592 Mon Sep 17 00:00:00 2001 From: Tom Edwards <edwardstj1@cardiff.ac.uk> Date: Tue, 17 Dec 2024 04:30:49 +0000 Subject: [PATCH] demo --- api/api_functions.py | 9 ++ func/collocation/collocation.py | 74 +++++++----- func/concordance/concordance.py | 167 +++++++++++++++------------- func/mutlidatasets/multidatasets.py | 2 +- func/ner/ner.py | 56 ++++++---- func/neroverall/neroverall.py | 21 ++-- func/sentiment/sentiment.py | 33 +++++- func/translation/translation.py | 48 ++++---- func/usas/usas.py | 74 ++++++++---- func/usasFine/usasFine.py | 85 ++++++++++++++ main.py | 8 ++ 11 files changed, 387 insertions(+), 190 deletions(-) create mode 100644 func/usasFine/usasFine.py diff --git a/api/api_functions.py b/api/api_functions.py index 61afca1..d4d5cd1 100644 --- a/api/api_functions.py +++ b/api/api_functions.py @@ -8,6 +8,7 @@ from func.collocation.collocation import * from func.concordance.concordance import * from func.mutlidatasets.multidatasets import * from func.neroverall.neroverall import * +from func.usasFine.usasFine import * # Perform NER on a file # TAKES XML text page @@ -44,6 +45,14 @@ def get_usas_for_data(page): return make_response(jsonify(result), 400) +def get_usasFine_for_data(page): + result = run_usasFine_on_text(page) + + if result["code"] == "SUCCESS": + return make_response(jsonify(result), 201) + + return make_response(jsonify(result), 400) + # Perform Sentiment analysis on a file # TAKES XML text page diff --git a/func/collocation/collocation.py b/func/collocation/collocation.py index 64396ea..6ca9db5 100644 --- a/func/collocation/collocation.py +++ b/func/collocation/collocation.py @@ -1,65 +1,79 @@ import spacy import math from shared.translate import translate +import nltk from nltk.collocations import TrigramCollocationFinder from nltk.metrics import TrigramAssocMeasures from nltk.collocations import BigramCollocationFinder from nltk.metrics import BigramAssocMeasures +from db.db_config import get_db -#page = '专精特新â€ä¼ä¸šï¼Œæ˜¯æŒ‡å…·æœ‰ä¸“业化ã€ç²¾ç»†åŒ–ã€ç‰¹è‰²åŒ–ã€æ–°é¢–化四大特å¾çš„ä¸å°ä¼ä¸šã€‚创新是这类ä¼ä¸šçš„çµé‚,足够的 ç ”å‘费用投入则是开展创新的é‡è¦ä¿éšœã€‚许多尚处在æˆé•¿æœŸçš„“专精特新â€ä¼ä¸šï¼Œè¿‘期普ééé‡â€œé’±ç´§â€éš¾é¢˜ã€‚å¦‚ä½•é›†èš æ›´å¤šçš„èµ„é‡‘æŠ•å…¥ç ”å‘ã€ä¿æŒåˆ›æ–°é¢†å…ˆåœ°ä½æ˜¯è¿™äº›ä¼ä¸šè¿‘æ¥é¢ä¸´çš„最大烦æ¼ã€‚“作为一家新ææ–™ç ”å‘å…¬å¸ï¼Œåˆ›æ–°æ˜¯æˆ‘们å‘展的é‡è¦é©±åŠ¨åŠ›ï¼Œåªæœ‰ç ”å‘投入的ä¸æ–åŠ ç ,ä¼ä¸šåˆ›æ–°å‘展的æ¥ä¼æ‰ä¸ä¼š é™é€Ÿã€‚â€æµ™æ±Ÿçœâ€œä¸“精特新â€ä¼ä¸šã€å®æ³¢åˆ›æ¶¦æ–°æ料有é™å…¬å¸è‘£äº‹é•¿å´æ™¯æ™–说,过去3年,ä¼ä¸šåœ¨ç ”å‘投入方é¢ä¸é—余力,累计投入2500万元,这对ä¼ä¸šæ¥è¯´ä¸æ˜¯ä¸ªå°æ•°ç›®ã€‚ ä»Šå¹´æ–°å…´å¸‚åœºçš„ç ”å‘需求ååˆ†è¿«åˆ‡ï¼Œæˆ‘ä»¬ä¸€ç›´æƒ³åŠ å¿« 超高纯钛åŠé’›åˆé‡‘ä¸è¯•ç”Ÿäº§çº¿é¡¹ç›® çš„ç ”å‘è¿›åº¦ï¼Œä½†è‹¦äºŽèµ„é‡‘ä¸ è¶³ã€‚ä»¤äººé«˜å…´çš„æ˜¯ï¼Œä»Šå¹´4月340万元å˜é‡å¢žå€¼ç¨Žç•™æŠµç¨Žé¢çš„到账,有效缓解了ä¼ä¸šçš„èµ„é‡‘åŽ‹åŠ›ï¼ŒåŠ å¿«äº†ä¼ä¸šçš„ç ”å‘ è¿›åº¦ã€‚â€å´æ™¯æ™–说,目å‰ï¼Œâ€œè¶…高纯钛åŠé’›åˆé‡‘ä¸è¯•ç”Ÿäº§çº¿é¡¹ç›®â€æ£åœ¨æœ‰åºæŽ¨è¿›ï¼Œä¸€æ—¦æŠ•äº§å°†ç¼“解åŠå¯¼ä½“产业的高纯钛原æ料供应ä¸è¶³é—®é¢˜ï¼Œæå‡å›½äº§æº…å°„é¶æ的市场竞争力' +def run_collocation_on_text(page): + collocations = [] + nlp = spacy.load('zh_core_web_sm') + conn, cursor = get_db() + cursor.execute('SELECT * from news;') + res = cursor.fetchall() + data = [] -def clean(text): + for row in res: - text = text.replace('<p>', ' ') - text = text.replace('</p>', ' ') - text = text.replace('<br>', ' ') - text = text.replace('</br>', ' ') - text = text.replace('><', ' ') - text = text.replace('\u3000', ' ') - text = text.replace('br', ' ') - cltext = text.replace('\n', ' ').strip() - return str(cltext) + docid = row[0] -def 
run_collocation_on_text(page): + content = row[-1].replace('\n', ' ').replace('\t', ' ') - page = clean(page) + data.append([docid, content]) corpus = [] - collocations = [] + for i in range(0, len(data)): + id = data[i][0] + txt = data[i][1] + doc = nlp(txt) + - nlp = spacy.load('zh_core_web_sm') - doc = nlp(page) - for token in doc: - if not token.is_stop: - # print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,token.shape_, token.is_alpha, token.is_stop) - corpus.append(token.text.lower()) + for token in doc: + if not token.is_stop: + # print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,token.shape_, token.is_alpha, token.is_stop) + corpus.append(token.text.lower()) biagram_collocation = BigramCollocationFinder.from_words(corpus) #biagram_collocation.apply_freq_filter(3) - trigram_collocation = TrigramCollocationFinder.from_words(corpus) + #trigram_collocation = TrigramCollocationFinder.from_words(corpus) #trigram_collocation.apply_freq_filter(3) - scoredbigrams = biagram_collocation.score_ngrams(BigramAssocMeasures().likelihood_ratio)[:10] - scoretrigrams = trigram_collocation.score_ngrams(TrigramAssocMeasures().likelihood_ratio)[:10] - allscores = scoredbigrams+scoretrigrams - for item in allscores: + scoredbigrams = biagram_collocation.score_ngrams(BigramAssocMeasures().likelihood_ratio) + bigramterms = [] + for j in scoredbigrams: + jstr = " ".join(b for b in j[0]) + bigramterms.append(jstr) + + #scoretrigrams = trigram_collocation.score_ngrams(TrigramAssocMeasures().likelihood_ratio) + #allscores = scoredbigrams+scoretrigrams + for item in scoredbigrams: itemstr = " ".join(i for i in item[0]) - translation = translate(itemstr).text.lower() - score = item[1]/1000000 - collocations.append({"0 Term": itemstr,"1 Translation":translation ,"2 LogRatio": score}) + if '部' in itemstr: + itemstrnew = itemstr.replace('部','').strip().replace(' ','') + translation = translate(itemstr.replace('部','').strip()).text.lower() + #print(translation) + #print('--------------') + score = round(item[1],3) + freq = bigramterms.count(itemstr)/ 1000 + collocations.append({"0 Collocate": itemstrnew , "1 LogRatio": score, "2 Frequency":freq}) - collocations = sorted(collocations, key=lambda x: x["2 LogRatio"], reverse=True) + collocationsorted = sorted(collocations, key=lambda x: (x["1 LogRatio"],x["2 Frequency"]), reverse=True)[:10] - result = {'output': collocations, 'message': 'Done', 'code': 'SUCCESS'} + result = {'output': collocationsorted, 'message': 'Done', 'code': 'SUCCESS'} return result + + diff --git a/func/concordance/concordance.py b/func/concordance/concordance.py index c716d20..b9fe3fd 100644 --- a/func/concordance/concordance.py +++ b/func/concordance/concordance.py @@ -1,111 +1,120 @@ import spacy -import math -from collections import Counter, defaultdict from shared.translate import translate from wasabi import Printer from spacy.matcher import PhraseMatcher -import re -from nltk.collocations import TrigramCollocationFinder -from nltk.metrics import TrigramAssocMeasures +from db.db_config import get_db from nltk.collocations import BigramCollocationFinder from nltk.metrics import BigramAssocMeasures -#page = '专精特新â€ä¼ä¸šï¼Œæ˜¯æŒ‡å…·æœ‰ä¸“业化ã€ç²¾ç»†åŒ–ã€ç‰¹è‰²åŒ–ã€æ–°é¢–化四大特å¾çš„ä¸å°ä¼ä¸šã€‚创新是这类ä¼ä¸šçš„çµé‚,足够的 ç ”å‘费用投入则是开展创新的é‡è¦ä¿éšœã€‚许多尚处在æˆé•¿æœŸçš„“专精特新â€ä¼ä¸šï¼Œè¿‘期普ééé‡â€œé’±ç´§â€éš¾é¢˜ã€‚å¦‚ä½•é›†èš æ›´å¤šçš„èµ„é‡‘æŠ•å…¥ç ”å‘ã€ä¿æŒåˆ›æ–°é¢†å…ˆåœ°ä½æ˜¯è¿™äº›ä¼ä¸šè¿‘æ¥é¢ä¸´çš„最大烦æ¼ã€‚“作为一家新ææ–™ç ”å‘å…¬å¸ï¼Œåˆ›æ–°æ˜¯æˆ‘们å‘展的é‡è¦é©±åŠ¨åŠ›ï¼Œåªæœ‰ç 
”å‘投入的ä¸æ–åŠ ç ,ä¼ä¸šåˆ›æ–°å‘展的æ¥ä¼æ‰ä¸ä¼š é™é€Ÿã€‚â€æµ™æ±Ÿçœâ€œä¸“精特新â€ä¼ä¸šã€å®æ³¢åˆ›æ¶¦æ–°æ料有é™å…¬å¸è‘£äº‹é•¿å´æ™¯æ™–说,过去3年,ä¼ä¸šåœ¨ç ”å‘投入方é¢ä¸é—余力,累计投入2500万元,这对ä¼ä¸šæ¥è¯´ä¸æ˜¯ä¸ªå°æ•°ç›®ã€‚ ä»Šå¹´æ–°å…´å¸‚åœºçš„ç ”å‘需求ååˆ†è¿«åˆ‡ï¼Œæˆ‘ä»¬ä¸€ç›´æƒ³åŠ å¿« 超高纯钛åŠé’›åˆé‡‘ä¸è¯•ç”Ÿäº§çº¿é¡¹ç›® çš„ç ”å‘è¿›åº¦ï¼Œä½†è‹¦äºŽèµ„é‡‘ä¸ è¶³ã€‚ä»¤äººé«˜å…´çš„æ˜¯ï¼Œä»Šå¹´4月340万元å˜é‡å¢žå€¼ç¨Žç•™æŠµç¨Žé¢çš„到账,有效缓解了ä¼ä¸šçš„èµ„é‡‘åŽ‹åŠ›ï¼ŒåŠ å¿«äº†ä¼ä¸šçš„ç ”å‘ è¿›åº¦ã€‚â€å´æ™¯æ™–说,目å‰ï¼Œâ€œè¶…高纯钛åŠé’›åˆé‡‘ä¸è¯•ç”Ÿäº§çº¿é¡¹ç›®â€æ£åœ¨æœ‰åºæŽ¨è¿›ï¼Œä¸€æ—¦æŠ•äº§å°†ç¼“解åŠå¯¼ä½“产业的高纯钛原æ料供应ä¸è¶³é—®é¢˜ï¼Œæå‡å›½äº§æº…å°„é¶æ的市场竞争力' -def clean(text): +def collocations(): + collocations = [] - text = text.replace('<p>', ' ') - text = text.replace('</p>', ' ') - text = text.replace('<br>', ' ') - text = text.replace('</br>', ' ') - text = text.replace('><', ' ') - text = text.replace('\u3000', ' ') - text = text.replace('br', ' ') - text = text.replace('——', '') - text = text.replace('[38;5;1m', '') - text = text.replace('[0m','') - cltext = text.replace('\n', ' ').strip() - return str(cltext) + nlp = spacy.load('zh_core_web_sm') + conn, cursor = get_db() + cursor.execute('SELECT * from news;') + res = cursor.fetchall() -def collocations(doc): - corpus = [] - collocations = [] + data = [] - for token in doc: + for row in res: + docid = row[0] - if not token.is_stop: - # print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,token.shape_, token.is_alpha, token.is_stop) - corpus.append(token.text.lower()) + content = row[-1].replace('\n', ' ').replace('\t', ' ') - biagram_collocation = BigramCollocationFinder.from_words(corpus) - # biagram_collocation.apply_freq_filter(3) - trigram_collocation = TrigramCollocationFinder.from_words(corpus) - # trigram_collocation.apply_freq_filter(3) + data.append([docid, content]) - scoredbigrams = biagram_collocation.score_ngrams(BigramAssocMeasures().likelihood_ratio)[:10] + corpus = [] + for i in range(0, len(data)): + txt = data[i][1] + doc = nlp(txt) - scoretrigrams = trigram_collocation.score_ngrams(TrigramAssocMeasures().likelihood_ratio)[:10] - allscores = scoredbigrams + scoretrigrams - for item in allscores: - itemstr = " ".join(i for i in item[0]) - translation = translate(itemstr).text.lower() - score = item[1] / 1000000 - collocations.append({"0 Term": itemstr, "1 Translation": translation, "2 LogRatio": score}) + for token in doc: + if not token.is_stop: + # print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,token.shape_, token.is_alpha, token.is_stop) + corpus.append(token.text.lower()) + + biagram_collocation = BigramCollocationFinder.from_words(corpus) - collocations = sorted(collocations, key=lambda x: x["2 LogRatio"], reverse=True) + scoredbigrams = biagram_collocation.score_ngrams(BigramAssocMeasures().likelihood_ratio) + bigramterms = [] + for j in scoredbigrams: + jstr = " ".join(b for b in j[0]) + bigramterms.append(jstr) + + # scoretrigrams = trigram_collocation.score_ngrams(TrigramAssocMeasures().likelihood_ratio) + # allscores = scoredbigrams+scoretrigrams + for item in scoredbigrams: + itemstr = " ".join(i for i in item[0]) + if '部' in itemstr: + itemstrnew = itemstr + translation = translate(itemstr).text.lower() + # print(translation) + # print('--------------') + score = round(item[1], 3) - terms = [item.get('0 Term') for item in collocations] + freq = bigramterms.count(itemstr) / 1000 + collocations.append({"0 Collocate": itemstrnew, "1 LogRatio": score, "2 Frequency": freq}) + collocationsorted = sorted(collocations, key=lambda 
x: (x["1 LogRatio"], x["2 Frequency"]), reverse=True)[:10] - return terms + terms = [d['0 Collocate'] for d in collocationsorted] + return terms def run_concordance_on_text(page): - page = clean(page) - print('Page') - print(page) nlp = spacy.load('zh_core_web_sm') - doc = nlp(page) - terms = collocations(doc) + conn, cursor = get_db() + cursor.execute('SELECT * from news;') + res = cursor.fetchall() + data = [] + for row in res: + docid = row[0] + content = row[-1].replace('\n', ' ').replace('\t', ' ') + data.append([docid, content]) + concordances = [] - matcher = PhraseMatcher(nlp.vocab,attr='LOWER') - patterns = [nlp.make_doc(term) for term in terms] - matcher.add("TermCollocations", patterns) - - matches = matcher(doc) - match = Printer() - for i, start, end in matches: - perecedingSlice = doc[start - 20: start].text - if '。' in perecedingSlice: - perecedingSlice = perecedingSlice.split('。')[1] - else: - perecedingSlice = perecedingSlice.strip() - - - #perecedingSliceTr = clean(translate(doc[start - 20: start]).text) - matchedTerm = doc[start:end].text - print(matchedTerm) - #matchedTerm = doc[start:end].text - matchedTermTr = match.text(translate(doc[start:end].text).text, color='red', no_print=True) - #matchedTermTr = match.text(translate(doc[start:end].text).text) - followingSlice = doc[end:end + 20].text - #followingSliceTr = clean(translate(doc[end:end + 20]).text) - - #context = perecedingSlice+', '+matchedTerm+', '+followingSlice - - #contextTr = perecedingSliceTr+', '+matchedTermTr+', '+followingSliceTr - #concordances.append({"0 Term": escapeAnscii(matchedTerm), "1 Eng": escapeAnscii(matchedTermTr), "2 Context":escapeAnscii(context), "3 Context Eng":escapeAnscii(contextTr)}) - concordances.append({"0 Preceded By":perecedingSlice,"1 Term": matchedTerm, "2 Followed By": followingSlice}) - - + terms = collocations() + for i in range(0, len(data)): + id = data[i][0] + txt = data[i][1] + doc = nlp(txt) + + + matcher = PhraseMatcher(nlp.vocab,attr='LOWER') + patterns = [nlp.make_doc(term) for term in terms] + matcher.add("TermCollocations", patterns) + + matches = matcher(doc) + match = Printer() + for j, start, end in matches: + perecedingSlice = doc[start - 20: start].text + if '。' in perecedingSlice: + perecedingSlice = perecedingSlice.split('。')[1] + else: + perecedingSlice = perecedingSlice.strip() + + + #perecedingSliceTr = clean(translate(doc[start - 20: start]).text) + matchedTerm = doc[start:end].text + #matchedTerm = doc[start:end].text + matchedTermTr = match.text(translate(doc[start:end].text).text, color='red', no_print=True) + #matchedTermTr = match.text(translate(doc[start:end].text).text) + followingSlice = doc[end:end + 20].text + #followingSliceTr = clean(translate(doc[end:end + 20]).text) + + #context = perecedingSlice+', '+matchedTerm+', '+followingSlice + + #contextTr = perecedingSliceTr+', '+matchedTermTr+', '+followingSliceTr + #concordances.append({"0 Term": escapeAnscii(matchedTerm), "1 Eng": escapeAnscii(matchedTermTr), "2 Context":escapeAnscii(context), "3 Context Eng":escapeAnscii(contextTr)}) + concordances.append({"0 Preceded By":perecedingSlice,"1 Term": matchedTerm, "2 Followed By": followingSlice}) + + result = {'output': concordances, 'message': 'Done', 'code': 'SUCCESS'} return result -#def main(): -# result = run_concordance_on_text(page) -#main() diff --git a/func/mutlidatasets/multidatasets.py b/func/mutlidatasets/multidatasets.py index 00c845c..6548a34 100644 --- a/func/mutlidatasets/multidatasets.py +++ b/func/mutlidatasets/multidatasets.py 
@@ -12,7 +12,7 @@ def run_multidatasets(): data = [] for row in res: print(row) - data.append({"0 Id": row[0], "1 Title": row[1], "2 Date": row[4],"3 Content":row[-1]}) + data.append({"0 Title": row[1], "1 Date": row[4]}) diff --git a/func/ner/ner.py b/func/ner/ner.py index a9d8977..e783903 100644 --- a/func/ner/ner.py +++ b/func/ner/ner.py @@ -2,42 +2,54 @@ import torch from ckip_transformers.nlp import CkipWordSegmenter, CkipPosTagger, CkipNerChunker from transformers import pipeline import pandas as pd +from db.db_config import get_db -from shared.translate import translate +#from shared.translate import translate #page = '尼罗河 是一æ¡æµç¶“éžæ´²æ±éƒ¨èˆ‡åŒ—部的河æµï¼Œèˆ‡ä¸éžåœ°å€çš„剛果河ã€éžæ´²å—部的赞比西河以åŠè¥¿éžåœ°åŒºçš„尼日尔河並列éžæ´²æœ€å¤§çš„四個河æµç³»çµ±ã€‚' # Perform NER on Text def run_ner_on_text(page): ner_driver = CkipNerChunker(model="bert-base") + conn, cursor = get_db() + cursor.execute('SELECT * from news;') + res = cursor.fetchall() + data = [] + for row in res: + docid = row[0] + content = row[-1].replace('\n', ' ').replace('\t', ' ') + data.append([docid, content]) + ner_words_with_count = [] + for i in range(0, len(data)): + id = data[i][0] + txt = data[i][1] + ner = ner_driver([txt]) - ner = ner_driver([page]) + tags = [] + for item in ner[0]: + word = item.word + ner = item.ner + tags.append(word+'__'+ner) - tags = [] - for item in ner[0]: - word = item.word - ner = item.ner - #idx = item.idx - tags.append(word+'__'+ner) + seen_words = [] + for tag in tags: + if tag not in seen_words: - ner_words_with_count = [] - seen_words = [] - for tag in tags: - if tag not in seen_words: + freq = tags.count(tag) / 1000 + word = tag.split('__')[0] + ner = tag.split('__')[1] + #translation = translate(word).text + if ner == 'PERSON': + ner_words_with_count.append({"0 Word": word, "1 NER": ner, "2 Frequency": freq}) + seen_words.append(tag) - freq = tags.count(tag) / 1000000 - word = tag.split('__')[0] - ner = tag.split('__')[1] - translation = translate(word).text - ner_words_with_count.append({"0 Word": word, "1 Translation":translation, "2 NER": ner, "3 Frequency": freq}) - seen_words.append(tag) + perdoc = sorted(ner_words_with_count, key=lambda x: x["2 Frequency"], reverse=True)[:30] - data = sorted(ner_words_with_count, key=lambda x: x["3 Frequency"], reverse=True) - result = {'output': data,'message': 'Done', 'code': 'SUCCESS'} + result = {'output': perdoc,'message': 'Done', 'code': 'SUCCESS'} return result @@ -47,3 +59,7 @@ def run_ner_on_text(page): + + + + diff --git a/func/neroverall/neroverall.py b/func/neroverall/neroverall.py index 3c22352..32f79ab 100644 --- a/func/neroverall/neroverall.py +++ b/func/neroverall/neroverall.py @@ -16,39 +16,42 @@ def run_neroverall_on_text(page): data.append([docid, content]) ner_with_count = [] + tags = [] + ners = [] + seen_words = [] + seen_tags = [] + for i in range(0,len(data)): id = data[i][0] txt = data[i][1] ner = ner_driver([txt]) - tags = [] + for item in ner[0]: word = item.word ner = item.ner tags.append(word + '__' + ner) - ners = [] - seen_words = [] - seen_tags = [] for tag in tags: if tag not in seen_words: ner = tag.split('__')[1].strip() - ners.append(ner) seen_words.append(tag) for n in ners: if n not in seen_tags: - freq = ners.count(n) / 1000000 - ner_with_count.append({"0 Doc Id": id, "1 NER": n, "2 Frequency": freq}) + freq = ners.count(n) / 1000 + ner_with_count.append({"1 NER": n, "2 Frequency": freq}) seen_tags.append(n) - #nerall = sorted(ner_with_count, key=lambda x: (x["0 Doc Id"],x["2 Frequency"]), reverse=True) + nerall = 
sorted(ner_with_count, key=lambda x: x["2 Frequency"], reverse=True) - result = {'output': ner_with_count,'message': 'Done', 'code': 'SUCCESS'} + + result = {'output': nerall,'message': 'Done', 'code': 'SUCCESS'} return result + diff --git a/func/sentiment/sentiment.py b/func/sentiment/sentiment.py index 34d4fda..c12a014 100644 --- a/func/sentiment/sentiment.py +++ b/func/sentiment/sentiment.py @@ -8,30 +8,53 @@ from transformers import ( from transformers import pipeline import re +from db.db_config import get_db def zng(paragraph): for sent in re.findall(u'[^!?。\.\!\?]+[!?。\.\!\?]?', paragraph, flags=re.U): yield sent -#page = '尼罗河 是一æ¡æµç¶“éžæ´²æ±éƒ¨èˆ‡åŒ—部的河æµï¼Œèˆ‡ä¸éžåœ°å€çš„剛果河ã€éžæ´²å—部的赞比西河以åŠè¥¿éžåœ°åŒºçš„尼日尔河並列éžæ´²æœ€å¤§çš„四個河æµç³»çµ±ã€‚尼罗河 是一æ¡æµç¶“éžæ´²æ±éƒ¨èˆ‡åŒ—部的河æµï¼Œèˆ‡ä¸éžåœ°å€çš„剛果河ã€éžæ´²å—部的赞比西河以åŠè¥¿éžåœ°åŒºçš„尼日尔河並列éžæ´²æœ€å¤§çš„四個河æµç³»çµ±ã€‚' -def run_sentiment_on_text(page): - pagesList = list(zng(page)) +def run_sentiment_on_text(page): + conn, cursor = get_db() + cursor.execute('SELECT * from news;') + res = cursor.fetchall() + data = [] + sentiments = [] + for row in res: + docid = row[0] + content = row[-1].replace('\n', ' ').replace('\t', ' ') + data.append([docid, content]) + + allsentences = [] + for i in range(0, len(data)): + txt = data[i][1] + pagesList = list(zng(txt)) + allsentences.append(pagesList) + + allsentences = [x for xs in allsentences for x in xs] tokenizer = BertTokenizerFast.from_pretrained('bardsai/finance-sentiment-zh-base') model = AutoModelForSequenceClassification.from_pretrained('bardsai/finance-sentiment-zh-base') nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer) counts = dict() - for p in pagesList: + for p in allsentences: res = nlp(p)[0]['label'] counts[res] = counts.get(res, 0) + 1 if 'negative' not in counts.keys(): counts['negative'] = 0 - sentiments = [] + for k in counts.keys(): sentiments.append({"0 Sentiment": k, "1 Count": counts[k]}) + sentiments = sorted(sentiments, key=lambda x: x["1 Count"], reverse=True) result = {'output': sentiments, 'message': 'Done', 'code': 'SUCCESS'} + return result + + + + + diff --git a/func/translation/translation.py b/func/translation/translation.py index 6a03d67..190bd6b 100644 --- a/func/translation/translation.py +++ b/func/translation/translation.py @@ -1,26 +1,30 @@ -import torch -from ckip_transformers.nlp import CkipWordSegmenter, CkipPosTagger, CkipNerChunker -from transformers import pipeline -import pandas as pd - +import html_to_json from shared.translate import translate -#page = '尼罗河 是一æ¡æµç¶“éžæ´²æ±éƒ¨èˆ‡åŒ—部的河æµï¼Œèˆ‡ä¸éžåœ°å€çš„剛果河ã€éžæ´²å—部的赞比西河以åŠè¥¿éžåœ°åŒºçš„尼日尔河並列éžæ´²æœ€å¤§çš„四個河æµç³»çµ±ã€‚' + # Perform NER on Text def run_translation_on_text(page): - - translation = '<p>Translation</p>' - translation = translation + '<span>' - translation = translation + translate(page).text - translation = translation +'</span>' - - result = {'output': translation,'message': 'Done', 'code': 'SUCCESS'} - - return result - - - - - - - + try: + output_json = html_to_json.convert_tables(page) + output = output_json[0] + translated = [] + + for item in output: + try: + translated_item = {} # Use a proper dictionary to hold key-value pairs + for k, v in item.items(): + # Translate text and store in dictionary + translated_item[k] = translate(v).text + #time.sleep(1) # Throttle API calls to avoid exhausting memory and network resources + + translated.append(translated_item) + except Exception as e: + # Log or handle translation errors and continue processing other 
items + print(f"Error translating item {item}: {e}") + + + result = {'output': translated, 'message': 'Done', 'code': 'SUCCESS'} + return result + except Exception as e: + print(f"Error in run_translation_on_text: {e}") + return {'output': None, 'message': f'Error: {e}', 'code': 'FAILED'} \ No newline at end of file diff --git a/func/usas/usas.py b/func/usas/usas.py index a5c6a19..bf72130 100644 --- a/func/usas/usas.py +++ b/func/usas/usas.py @@ -1,4 +1,5 @@ import spacy +from db.db_config import get_db # Perform USAS on Text @@ -7,7 +8,7 @@ import spacy def run_usas_on_text(page): d = {} - with open('/Users/tom/PycharmProjects/cognistance/func/usas/usas_desc.txt') as f: + with open('/Users/tom/PycharmProjects/cognistance/func/usas/usas_overall.txt') as f: for line in f: lineL = line.replace('\n', '').split(' ', 1) key = lineL[0].strip() @@ -22,35 +23,60 @@ def run_usas_on_text(page): # Adds the Chinese PyMUSAS rule-based tagger to the main spaCy pipeline nlp.add_pipe('pymusas_rule_based_tagger', source=chinese_tagger_pipeline) - output_doc = nlp(page) - #data = [] + conn, cursor = get_db() + cursor.execute('SELECT * from news;') + res = cursor.fetchall() + data = [] + for row in res: + docid = row[0] + content = row[-1].replace('\n', ' ').replace('\t', ' ') + data.append([docid, content]) + + #output_doc = nlp(page) + usas_tags_with_count = [] tags = [] + seen_tags = [] + for i in range(0, len(data)): + id = data[i][0] + txt = data[i][1] + output_doc = nlp(txt) - for token in output_doc: - start, end = token._.pymusas_mwe_indexes[0] - idx = (start, end) - for el in token._.pymusas_tags: - el = el.split('.')[0] - #obj = {"word": token.text, "USAS Tags": el, "idx": idx} - tags.append(el) - #data.append(obj) - usas_tags_with_count = [] - seen_tags = [] - for tag in tags: - if tag not in seen_tags: - try: - freq = tags.count(tag)/1000000 - usas_tags_with_count.append({"0 Tag": tag, "1 Definition":d[tag],"2 Frequency": freq}) - except KeyError: - pass - seen_tags.append(tag) - - usas_tags_with_count = sorted(usas_tags_with_count, key=lambda x: x["2 Frequency"], reverse=True) - result = {'output': usas_tags_with_count, 'message': 'Done', 'code': 'SUCCESS'} + for token in output_doc: + start, end = token._.pymusas_mwe_indexes[0] + idx = (start, end) + + for el in token._.pymusas_tags: + el = el.split('.')[0][0] + #obj = {"word": token.text, "USAS Tags": el, "idx": idx} + tags.append(el) + #data.append(obj) + + + + for tag in tags: + if tag not in seen_tags: + try: + freq = tags.count(tag)/1000 + + usas_tags_with_count.append({"0 Tag": tag, "1 Definition":d[tag],"2 Frequency": freq}) + except KeyError: + pass + seen_tags.append(tag) + + usas_tags_with_count_sorted = sorted(usas_tags_with_count, key=lambda x: x["2 Frequency"], reverse=True) + + + result = {'output': usas_tags_with_count_sorted, 'message': 'Done', 'code': 'SUCCESS'} return result + + + + + + diff --git a/func/usasFine/usasFine.py b/func/usasFine/usasFine.py new file mode 100644 index 0000000..5255519 --- /dev/null +++ b/func/usasFine/usasFine.py @@ -0,0 +1,85 @@ +import spacy +from db.db_config import get_db + + +# Perform USAS on Text +#page = '尼罗河 是一æ¡æµç¶“éžæ´²æ±éƒ¨èˆ‡åŒ—部的河æµï¼Œèˆ‡ä¸éžåœ°å€çš„剛果河ã€éžæ´²å—部的赞比西河以åŠè¥¿éžåœ°åŒºçš„尼日尔河並列éžæ´²æœ€å¤§çš„四個河æµç³»çµ±ã€‚' + + +def run_usasFine_on_text(page): + d = {} + with open('/Users/tom/PycharmProjects/cognistance/func/usas/usas_desc.txt') as f: + for line in f: + lineL = line.replace('\n', '').split(' ', 1) + key = lineL[0].strip() + val = lineL[1].strip() + d[key] = val + + 
print(d) + # We exclude the following components as we do not need them. + nlp = spacy.load('zh_core_web_sm', exclude=['parser', 'ner']) + # Load the Chinese PyMUSAS rule-based tagger in a separate spaCy pipeline + chinese_tagger_pipeline = spacy.load('cmn_dual_upos2usas_contextual') + # Adds the Chinese PyMUSAS rule-based tagger to the main spaCy pipeline + nlp.add_pipe('pymusas_rule_based_tagger', source=chinese_tagger_pipeline) + + conn, cursor = get_db() + cursor.execute('SELECT * from news;') + res = cursor.fetchall() + data = [] + + for row in res: + docid = row[0] + content = row[-1].replace('\n', ' ').replace('\t', ' ') + data.append([docid, content]) + + #output_doc = nlp(page) + usas_tags_with_count = [] + tags = [] + seen_tags = [] + for i in range(0, len(data)): + id = data[i][0] + txt = data[i][1] + output_doc = nlp(txt) + + + + for token in output_doc: + start, end = token._.pymusas_mwe_indexes[0] + idx = (start, end) + + for el in token._.pymusas_tags: + el = el.split('.')[0] + #obj = {"word": token.text, "USAS Tags": el, "idx": idx} + #tags.append(el) + word = token.text + tags.append(word + '__' + el) + #data.append(obj) + + + + for tag in tags: + if tag not in seen_tags: + try: + freq = tags.count(tag)/1000 + word = tag.split('__')[0] + usas = tag.split('__')[1] + + if 'A' in usas: + tag_object = {"0 Word":word,"1 Discourse Field": usas, "2 Definition":d[usas],"3 Frequency": freq} + + usas_tags_with_count.append(tag_object) + + except KeyError: + pass + seen_tags.append(tag) + + usas_tags_with_count_sorted = sorted(usas_tags_with_count, key=lambda x: x["3 Frequency"], reverse=False)[:30] + result = {'output': usas_tags_with_count_sorted, 'message': 'Done', 'code': 'SUCCESS'} + + return result + + + + + diff --git a/main.py b/main.py index 422a801..ea0ba98 100644 --- a/main.py +++ b/main.py @@ -45,6 +45,14 @@ def usas(): return result +@app.route("/usasFine", methods=['POST']) +def usasFine(): + request_data = request.get_json() + page = request_data['page'] + result = get_usasFine_for_data(page) + + return result + @app.route("/sentiment", methods=['POST']) def sentiment(): -- GitLab
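Notes on the scoring step in collocation.py and concordance.py: both now rank bigrams over a stop-word-filtered token stream using NLTK's log-likelihood ratio. A minimal, self-contained sketch of that step, with a toy English token list standing in for the spaCy tokens built from the news table (the data below is illustrative only, not from the corpus):

from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

# Toy token stream standing in for the stop-word-filtered spaCy tokens
# that the patch builds from the news table.
tokens = ["pure", "titanium", "alloy", "pilot", "line",
          "pure", "titanium", "alloy", "production", "line"]

finder = BigramCollocationFinder.from_words(tokens)
# The patch keeps the frequency filter commented out; enabling it would
# drop bigrams seen fewer than 2 times before scoring.
# finder.apply_freq_filter(2)

# score_ngrams returns ((w1, w2), score) pairs, highest-scoring first.
for (w1, w2), score in finder.score_ngrams(BigramAssocMeasures.likelihood_ratio)[:5]:
    print(w1, w2, round(score, 3))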
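run_concordance_on_text builds its keyword-in-context windows with spaCy's PhraseMatcher over the top collocates. The sketch below shows only that windowing step, on an illustrative term list and sentence; spacy.blank('zh') (character-level tokenizer) is used here purely so the sketch runs without the zh_core_web_sm download the patch relies on. It also clamps the left boundary at 0, since doc[start - 20:start] with start < 20 would otherwise be interpreted relative to the end of the doc:

import spacy
from spacy.matcher import PhraseMatcher

# Lightweight stand-in for the zh_core_web_sm pipeline loaded in the patch.
nlp = spacy.blank("zh")

terms = ["高纯钛"]                      # illustrative; the patch uses collocations()
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
matcher.add("TermCollocations", [nlp.make_doc(t) for t in terms])

doc = nlp("半导体产业需要高纯钛原材料。")  # illustrative sentence
for match_id, start, end in matcher(doc):
    # Clamp at 0 so a match near the start cannot wrap around to the doc's end.
    left = doc[max(start - 20, 0):start].text
    right = doc[end:end + 20].text
    print(left, "|", doc[start:end].text, "|", right)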
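The new /usasFine route in main.py accepts a POSTed JSON body and returns the fine-grained USAS table. A minimal client sketch, assuming the Flask app is served locally on port 5000 (host and port are not part of this patch); the route still reads request_data['page'], so the key must be present even though run_usasFine_on_text now takes its documents from the news table:

import requests

URL = "http://localhost:5000/usasFine"   # assumed host/port; adjust to the deployment

# 'page' is still required by the route handler, although the current
# implementation reads its text from the database rather than the payload.
resp = requests.post(URL, json={"page": ""})

print(resp.status_code)                  # 201 on SUCCESS, 400 otherwise
for row in resp.json()["output"]:
    print(row["0 Word"], row["1 Discourse Field"], row["3 Frequency"])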