From f23a52a75b5f0118f56351dc26e062f645556592 Mon Sep 17 00:00:00 2001
From: Tom Edwards <edwardstj1@cardiff.ac.uk>
Date: Tue, 17 Dec 2024 04:30:49 +0000
Subject: [PATCH] demo

---
 api/api_functions.py                |   9 ++
 func/collocation/collocation.py     |  74 +++++++-----
 func/concordance/concordance.py     | 167 +++++++++++++++-------------
 func/mutlidatasets/multidatasets.py |   2 +-
 func/ner/ner.py                     |  56 ++++++----
 func/neroverall/neroverall.py       |  21 ++--
 func/sentiment/sentiment.py         |  33 +++++-
 func/translation/translation.py     |  48 ++++----
 func/usas/usas.py                   |  74 ++++++++----
 func/usasFine/usasFine.py           |  85 ++++++++++++++
 main.py                             |   8 ++
 11 files changed, 387 insertions(+), 190 deletions(-)
 create mode 100644 func/usasFine/usasFine.py

diff --git a/api/api_functions.py b/api/api_functions.py
index 61afca1..d4d5cd1 100644
--- a/api/api_functions.py
+++ b/api/api_functions.py
@@ -8,6 +8,7 @@ from func.collocation.collocation import *
 from func.concordance.concordance import *
 from func.mutlidatasets.multidatasets import *
 from func.neroverall.neroverall import *
+from func.usasFine.usasFine import *
 
 # Perform NER on a file
 # TAKES XML text page
@@ -44,6 +45,14 @@ def get_usas_for_data(page):
 
     return make_response(jsonify(result), 400)
 
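+# Perform fine-grained USAS analysis on a file
+# TAKES XML text page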
+def get_usasFine_for_data(page):
+    result = run_usasFine_on_text(page)
+
+    if result["code"] == "SUCCESS":
+        return make_response(jsonify(result), 201)
+
+    return make_response(jsonify(result), 400)
+
 
 # Perform Sentiment analysis on a file
 # TAKES XML text page
diff --git a/func/collocation/collocation.py b/func/collocation/collocation.py
index 64396ea..6ca9db5 100644
--- a/func/collocation/collocation.py
+++ b/func/collocation/collocation.py
@@ -1,65 +1,79 @@
 import spacy
 import math
 from shared.translate import translate
+import nltk
 from nltk.collocations import TrigramCollocationFinder
 from nltk.metrics import TrigramAssocMeasures
 from nltk.collocations import BigramCollocationFinder
 from nltk.metrics import BigramAssocMeasures
+from db.db_config import get_db
 
 
-#page = '专精特新”企业,是指具有专业化、精细化、特色化、新颖化四大特征的中小企业。创新是这类企业的灵魂,足够的 研发费用投入则是开展创新的重要保障。许多尚处在成长期的“专精特新”企业,近期普遍遭遇“钱紧”难题。如何集聚 更多的资金投入研发、保持创新领先地位是这些企业近来面临的最大烦恼。“作为一家新材料研发公司,创新是我们发展的重要驱动力,只有研发投入的不断加码,企业创新发展的步伐才不会 降速。”浙江省“专精特新”企业、宁波创润新材料有限公司董事长吴景晖说,过去3年,企业在研发投入方面不遗余力,累计投入2500万元,这对企业来说不是个小数目。 今年新兴市场的研发需求十分迫切,我们一直想加快 超高纯钛及钛合金中试生产线项目 的研发进度,但苦于资金不 足。令人高兴的是,今年4月340万元存量增值税留抵税额的到账,有效缓解了企业的资金压力,加快了企业的研发 进度。”吴景晖说,目前,“超高纯钛及钛合金中试生产线项目”正在有序推进,一旦投产将缓解半导体产业的高纯钛原材料供应不足问题,提升国产溅射靶材的市场竞争力'
+def run_collocation_on_text(page):
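+    # Collocations are computed over every row of the `news` table; the `page`
+    # argument is currently unused.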
 
+    collocations = []
 
+    nlp = spacy.load('zh_core_web_sm')
+    conn, cursor = get_db()
+    cursor.execute('SELECT * from news;')
+    res = cursor.fetchall()
 
+    data = []
 
-def clean(text):
+    for row in res:
 
-    text = text.replace('<p>', ' ')
-    text = text.replace('</p>', ' ')
-    text = text.replace('<br>', ' ')
-    text = text.replace('</br>', ' ')
-    text = text.replace('><', ' ')
-    text = text.replace('\u3000', ' ')
-    text = text.replace('br', ' ')
-    cltext = text.replace('\n', ' ').strip()
-    return str(cltext)
+        docid = row[0]
 
-def run_collocation_on_text(page):
+        content = row[-1].replace('\n', ' ').replace('\t', ' ')
 
-    page = clean(page)
+        data.append([docid, content])
 
     corpus = []
-    collocations = []
+    for i in range(0, len(data)):
+        id = data[i][0]
+        txt = data[i][1]
+        doc = nlp(txt)
+
 
-    nlp = spacy.load('zh_core_web_sm')
-    doc = nlp(page)
 
-    for token in doc:
-        if not token.is_stop:
-            # print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,token.shape_, token.is_alpha, token.is_stop)
-            corpus.append(token.text.lower())
+        for token in doc:
+            if not token.is_stop:
+                # print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,token.shape_, token.is_alpha, token.is_stop)
+                corpus.append(token.text.lower())
 
 
     biagram_collocation = BigramCollocationFinder.from_words(corpus)
     #biagram_collocation.apply_freq_filter(3)
-    trigram_collocation = TrigramCollocationFinder.from_words(corpus)
+    #trigram_collocation = TrigramCollocationFinder.from_words(corpus)
     #trigram_collocation.apply_freq_filter(3)
 
-    scoredbigrams = biagram_collocation.score_ngrams(BigramAssocMeasures().likelihood_ratio)[:10]
 
-    scoretrigrams = trigram_collocation.score_ngrams(TrigramAssocMeasures().likelihood_ratio)[:10]
-    allscores = scoredbigrams+scoretrigrams
-    for item in allscores:
+    scoredbigrams = biagram_collocation.score_ngrams(BigramAssocMeasures().likelihood_ratio)
+    bigramterms = []
+    for j in scoredbigrams:
+        jstr = " ".join(b for b in j[0])
+        bigramterms.append(jstr)
+
+    #scoretrigrams = trigram_collocation.score_ngrams(TrigramAssocMeasures().likelihood_ratio)
+    #allscores = scoredbigrams+scoretrigrams
+    for item in scoredbigrams:
         itemstr = " ".join(i for i in item[0])
-        translation = translate(itemstr).text.lower()
-        score = item[1]/1000000
-        collocations.append({"0 Term": itemstr,"1 Translation":translation ,"2 LogRatio": score})
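+        # keep only bigrams containing 部; the character itself is stripped from
+        # the reported collocate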
+        if '部' in itemstr:
+            itemstrnew = itemstr.replace('部','').strip().replace(' ','')
+            translation = translate(itemstr.replace('部','').strip()).text.lower()
+            #print(translation)
+            #print('--------------')
+            score = round(item[1],3)
 
+            # raw corpus frequency of the bigram, scaled per 1,000
+            freq = biagram_collocation.ngram_fd[item[0]] / 1000
+            collocations.append({"0 Collocate": itemstrnew , "1 LogRatio": score, "2 Frequency":freq})
 
-    collocations = sorted(collocations, key=lambda x: x["2 LogRatio"], reverse=True)
 
+    collocationsorted = sorted(collocations, key=lambda x: (x["1 LogRatio"],x["2 Frequency"]), reverse=True)[:10]
 
-    result = {'output': collocations, 'message': 'Done', 'code': 'SUCCESS'}
+    result = {'output': collocationsorted, 'message': 'Done', 'code': 'SUCCESS'}
     return result
 
 
+
+
diff --git a/func/concordance/concordance.py b/func/concordance/concordance.py
index c716d20..b9fe3fd 100644
--- a/func/concordance/concordance.py
+++ b/func/concordance/concordance.py
@@ -1,111 +1,120 @@
 import spacy
-import math
-from collections import Counter, defaultdict
 from shared.translate import translate
 from wasabi import Printer
 from spacy.matcher import PhraseMatcher
-import re
-from nltk.collocations import TrigramCollocationFinder
-from nltk.metrics import TrigramAssocMeasures
+from db.db_config import get_db
 from nltk.collocations import BigramCollocationFinder
 from nltk.metrics import BigramAssocMeasures
 
-#page = '专精特新”企业,是指具有专业化、精细化、特色化、新颖化四大特征的中小企业。创新是这类企业的灵魂,足够的 研发费用投入则是开展创新的重要保障。许多尚处在成长期的“专精特新”企业,近期普遍遭遇“钱紧”难题。如何集聚 更多的资金投入研发、保持创新领先地位是这些企业近来面临的最大烦恼。“作为一家新材料研发公司,创新是我们发展的重要驱动力,只有研发投入的不断加码,企业创新发展的步伐才不会 降速。”浙江省“专精特新”企业、宁波创润新材料有限公司董事长吴景晖说,过去3年,企业在研发投入方面不遗余力,累计投入2500万元,这对企业来说不是个小数目。 今年新兴市场的研发需求十分迫切,我们一直想加快 超高纯钛及钛合金中试生产线项目 的研发进度,但苦于资金不 足。令人高兴的是,今年4月340万元存量增值税留抵税额的到账,有效缓解了企业的资金压力,加快了企业的研发 进度。”吴景晖说,目前,“超高纯钛及钛合金中试生产线项目”正在有序推进,一旦投产将缓解半导体产业的高纯钛原材料供应不足问题,提升国产溅射靶材的市场竞争力'
 
 
-def clean(text):
+def collocations():
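+    # Recompute the top bigram collocates over the `news` table; the returned
+    # terms seed the PhraseMatcher in run_concordance_on_text below.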
+    collocations = []
 
-    text = text.replace('<p>', ' ')
-    text = text.replace('</p>', ' ')
-    text = text.replace('<br>', ' ')
-    text = text.replace('</br>', ' ')
-    text = text.replace('><', ' ')
-    text = text.replace('\u3000', ' ')
-    text = text.replace('br', ' ')
-    text = text.replace('——', '')
-    text = text.replace('[38;5;1m', '')
-    text = text.replace('[0m','')
-    cltext = text.replace('\n', ' ').strip()
-    return str(cltext)
+    nlp = spacy.load('zh_core_web_sm')
+    conn, cursor = get_db()
+    cursor.execute('SELECT * from news;')
+    res = cursor.fetchall()
 
-def collocations(doc):
-    corpus = []
-    collocations = []
+    data = []
 
-    for token in doc:
+    for row in res:
+        docid = row[0]
 
-        if not token.is_stop:
-            # print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,token.shape_, token.is_alpha, token.is_stop)
-            corpus.append(token.text.lower())
+        content = row[-1].replace('\n', ' ').replace('\t', ' ')
 
-    biagram_collocation = BigramCollocationFinder.from_words(corpus)
-    # biagram_collocation.apply_freq_filter(3)
-    trigram_collocation = TrigramCollocationFinder.from_words(corpus)
-    # trigram_collocation.apply_freq_filter(3)
+        data.append([docid, content])
 
-    scoredbigrams = biagram_collocation.score_ngrams(BigramAssocMeasures().likelihood_ratio)[:10]
+    corpus = []
+    for i in range(0, len(data)):
+        txt = data[i][1]
+        doc = nlp(txt)
 
-    scoretrigrams = trigram_collocation.score_ngrams(TrigramAssocMeasures().likelihood_ratio)[:10]
-    allscores = scoredbigrams + scoretrigrams
-    for item in allscores:
-        itemstr = " ".join(i for i in item[0])
-        translation = translate(itemstr).text.lower()
-        score = item[1] / 1000000
-        collocations.append({"0 Term": itemstr, "1 Translation": translation, "2 LogRatio": score})
+        for token in doc:
+            if not token.is_stop:
+                # print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,token.shape_, token.is_alpha, token.is_stop)
+                corpus.append(token.text.lower())
+
+    biagram_collocation = BigramCollocationFinder.from_words(corpus)
 
-    collocations = sorted(collocations, key=lambda x: x["2 LogRatio"], reverse=True)
 
+    scoredbigrams = biagram_collocation.score_ngrams(BigramAssocMeasures().likelihood_ratio)
+    bigramterms = []
+    for j in scoredbigrams:
+        jstr = " ".join(b for b in j[0])
+        bigramterms.append(jstr)
+
+    # scoretrigrams = trigram_collocation.score_ngrams(TrigramAssocMeasures().likelihood_ratio)
+    # allscores = scoredbigrams+scoretrigrams
+    for item in scoredbigrams:
+        itemstr = " ".join(i for i in item[0])
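+        # keep only bigrams containing 部, mirroring the filter in func/collocation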
+        if '部' in itemstr:
+            itemstrnew = itemstr
+            translation = translate(itemstr).text.lower()
+            # print(translation)
+            # print('--------------')
+            score = round(item[1], 3)
 
-    terms = [item.get('0 Term') for item in collocations]
+            # raw corpus frequency of the bigram, scaled per 1,000
+            freq = biagram_collocation.ngram_fd[item[0]] / 1000
+            collocations.append({"0 Collocate": itemstrnew, "1 LogRatio": score, "2 Frequency": freq})
 
+    collocationsorted = sorted(collocations, key=lambda x: (x["1 LogRatio"], x["2 Frequency"]), reverse=True)[:10]
 
-    return  terms
+    terms = [d['0 Collocate'] for d in collocationsorted]
+    return terms
 
 
 def run_concordance_on_text(page):
-    page = clean(page)
-    print('Page')
-    print(page)
     nlp = spacy.load('zh_core_web_sm')
-    doc = nlp(page)
-    terms = collocations(doc)
+    conn, cursor = get_db()
+    cursor.execute('SELECT * from news;')
+    res = cursor.fetchall()
+    data = []
+    for row in res:
+        docid = row[0]
+        content = row[-1].replace('\n', ' ').replace('\t', ' ')
+        data.append([docid, content])
+
     concordances = []
-    matcher = PhraseMatcher(nlp.vocab,attr='LOWER')
-    patterns = [nlp.make_doc(term) for term in terms]
-    matcher.add("TermCollocations", patterns)
-
-    matches = matcher(doc)
-    match = Printer()
-    for i, start, end in matches:
-        perecedingSlice = doc[start - 20: start].text
-        if '。' in perecedingSlice:
-            perecedingSlice = perecedingSlice.split('。')[1]
-        else:
-            perecedingSlice = perecedingSlice.strip()
-
-
-        #perecedingSliceTr = clean(translate(doc[start - 20: start]).text)
-        matchedTerm = doc[start:end].text
-        print(matchedTerm)
-        #matchedTerm = doc[start:end].text
-        matchedTermTr = match.text(translate(doc[start:end].text).text, color='red', no_print=True)
-        #matchedTermTr = match.text(translate(doc[start:end].text).text)
-        followingSlice = doc[end:end + 20].text
-        #followingSliceTr = clean(translate(doc[end:end + 20]).text)
-
-        #context = perecedingSlice+', '+matchedTerm+', '+followingSlice
-
-        #contextTr = perecedingSliceTr+', '+matchedTermTr+', '+followingSliceTr
-        #concordances.append({"0 Term": escapeAnscii(matchedTerm), "1 Eng": escapeAnscii(matchedTermTr), "2 Context":escapeAnscii(context), "3 Context Eng":escapeAnscii(contextTr)})
-        concordances.append({"0 Preceded By":perecedingSlice,"1 Term": matchedTerm, "2 Followed By": followingSlice})
-
-   
+    terms = collocations()
+    for i in range(0, len(data)):
+        id = data[i][0]
+        txt = data[i][1]
+        doc = nlp(txt)
+
+
+        matcher = PhraseMatcher(nlp.vocab,attr='LOWER')
+        patterns = [nlp.make_doc(term) for term in terms]
+        matcher.add("TermCollocations", patterns)
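+        # each match is reported with up to 20 tokens of context on either side,
+        # trimmed to the current sentence on the preceding side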
+
+        matches = matcher(doc)
+        match = Printer()
+        for j, start, end in matches:
+            # clamp the window start so we never pass a negative token index
+            precedingSlice = doc[max(start - 20, 0): start].text
+            if '。' in precedingSlice:
+                # keep only the text after the last sentence boundary
+                precedingSlice = precedingSlice.split('。')[-1]
+            else:
+                precedingSlice = precedingSlice.strip()
+
+
+            #perecedingSliceTr = clean(translate(doc[start - 20: start]).text)
+            matchedTerm = doc[start:end].text
+            #matchedTerm = doc[start:end].text
+            matchedTermTr = match.text(translate(doc[start:end].text).text, color='red', no_print=True)
+            #matchedTermTr = match.text(translate(doc[start:end].text).text)
+            followingSlice = doc[end:end + 20].text
+            #followingSliceTr = clean(translate(doc[end:end + 20]).text)
+
+            #context = perecedingSlice+', '+matchedTerm+', '+followingSlice
+
+            #contextTr = perecedingSliceTr+', '+matchedTermTr+', '+followingSliceTr
+            #concordances.append({"0 Term": escapeAnscii(matchedTerm), "1 Eng": escapeAnscii(matchedTermTr), "2 Context":escapeAnscii(context), "3 Context Eng":escapeAnscii(contextTr)})
+            concordances.append({"0 Preceded By": precedingSlice, "1 Term": matchedTerm, "2 Followed By": followingSlice})
+
+
     result = {'output': concordances, 'message': 'Done', 'code': 'SUCCESS'}
 
     return result
 
-#def main():
-#    result = run_concordance_on_text(page)
 
-#main()
 
diff --git a/func/mutlidatasets/multidatasets.py b/func/mutlidatasets/multidatasets.py
index 00c845c..6548a34 100644
--- a/func/mutlidatasets/multidatasets.py
+++ b/func/mutlidatasets/multidatasets.py
@@ -12,7 +12,7 @@ def run_multidatasets():
     data = []
     for row in res:
         print(row)
-        data.append({"0 Id": row[0], "1 Title": row[1], "2 Date": row[4],"3 Content":row[-1]})
+        data.append({"0 Title": row[1], "1 Date": row[4]})
 
 
 
diff --git a/func/ner/ner.py b/func/ner/ner.py
index a9d8977..e783903 100644
--- a/func/ner/ner.py
+++ b/func/ner/ner.py
@@ -2,42 +2,54 @@ import torch
 from ckip_transformers.nlp import CkipWordSegmenter, CkipPosTagger, CkipNerChunker
 from transformers import pipeline
 import pandas as pd
+from db.db_config import get_db
 
-from shared.translate import translate
+#from shared.translate import translate
 
 #page = '尼罗河 是一条流經非洲東部與北部的河流,與中非地區的剛果河、非洲南部的赞比西河以及西非地区的尼日尔河並列非洲最大的四個河流系統。'
 # Perform NER on Text
 def run_ner_on_text(page):
     ner_driver = CkipNerChunker(model="bert-base")
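+    # NER now runs over every row of the `news` table; only PERSON entities are
+    # returned, ranked by frequency (top 30). The `page` argument is unused.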
+    conn, cursor = get_db()
+    cursor.execute('SELECT * from news;')
+    res = cursor.fetchall()
+    data = []
+    for row in res:
+        docid = row[0]
+        content = row[-1].replace('\n', ' ').replace('\t', ' ')
+        data.append([docid, content])
 
+    ner_words_with_count = []
+    for i in range(0, len(data)):
+        id = data[i][0]
+        txt = data[i][1]
+        ner = ner_driver([txt])
 
-    ner = ner_driver([page])
+        tags = []
+        for item in ner[0]:
+            word = item.word
+            label = item.ner  # avoid shadowing the chunker output list `ner`
+            tags.append(word + '__' + label)
 
-    tags = []
-    for item in ner[0]:
-        word = item.word
-        ner = item.ner
-        #idx = item.idx
 
-        tags.append(word+'__'+ner)
 
+        seen_words = []
+        for tag in tags:
+            if tag not in seen_words:
 
-    ner_words_with_count = []
-    seen_words = []
-    for tag in tags:
-        if tag not in seen_words:
+                freq = tags.count(tag) / 1000
+                word = tag.split('__')[0]
+                ner = tag.split('__')[1]
+                #translation = translate(word).text
+                if ner == 'PERSON':
+                    ner_words_with_count.append({"0 Word": word, "1 NER": ner, "2 Frequency": freq})
+            seen_words.append(tag)
 
-            freq = tags.count(tag) / 1000000
-            word = tag.split('__')[0]
-            ner = tag.split('__')[1]
-            translation = translate(word).text
-            ner_words_with_count.append({"0 Word": word, "1 Translation":translation, "2 NER": ner, "3 Frequency": freq})
-        seen_words.append(tag)
 
+    # rank all collected PERSON entities once, after the document loop
+    perdoc = sorted(ner_words_with_count, key=lambda x: x["2 Frequency"], reverse=True)[:30]
 
-    data = sorted(ner_words_with_count, key=lambda x: x["3 Frequency"], reverse=True)
 
-    result = {'output': data,'message': 'Done', 'code': 'SUCCESS'}
+    result = {'output': perdoc,'message': 'Done', 'code': 'SUCCESS'}
 
     return result
 
@@ -47,3 +59,7 @@ def run_ner_on_text(page):
 
 
 
+
+
+
+
diff --git a/func/neroverall/neroverall.py b/func/neroverall/neroverall.py
index 3c22352..32f79ab 100644
--- a/func/neroverall/neroverall.py
+++ b/func/neroverall/neroverall.py
@@ -16,39 +16,42 @@ def run_neroverall_on_text(page):
         data.append([docid, content])
 
     ner_with_count = []
+    tags = []
+    ners = []
+    seen_words = []
+    seen_tags = []
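+    # NER labels are aggregated across all documents in the news table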
+
     for i in range(0,len(data)):
         id = data[i][0]
         txt = data[i][1]
         ner = ner_driver([txt])
-        tags = []
+
         for item in ner[0]:
             word = item.word
             ner = item.ner
             tags.append(word + '__' + ner)
 
-        ners = []
 
-        seen_words = []
-        seen_tags = []
         for tag in tags:
             if tag not in seen_words:
                 ner = tag.split('__')[1].strip()
-
                 ners.append(ner)
 
             seen_words.append(tag)
 
         for n in ners:
             if n not in seen_tags:
-                freq = ners.count(n) / 1000000
-                ner_with_count.append({"0 Doc Id": id, "1 NER": n, "2 Frequency": freq})
+                freq = ners.count(n) / 1000
+                ner_with_count.append({"1 NER": n, "2 Frequency": freq})
 
             seen_tags.append(n)
 
-    #nerall = sorted(ner_with_count, key=lambda x: (x["0 Doc Id"],x["2 Frequency"]), reverse=True)
+    nerall = sorted(ner_with_count, key=lambda x: x["2 Frequency"], reverse=True)
 
-    result = {'output': ner_with_count,'message': 'Done', 'code': 'SUCCESS'}
+
+    result = {'output': nerall,'message': 'Done', 'code': 'SUCCESS'}
     
     return result
 
 
+
diff --git a/func/sentiment/sentiment.py b/func/sentiment/sentiment.py
index 34d4fda..c12a014 100644
--- a/func/sentiment/sentiment.py
+++ b/func/sentiment/sentiment.py
@@ -8,30 +8,53 @@ from transformers import (
 
 from transformers import pipeline
 import re
+from db.db_config import get_db
 
 def zng(paragraph):
     for sent in re.findall(u'[^!?。\.\!\?]+[!?。\.\!\?]?', paragraph, flags=re.U):
         yield sent
 
-#page = '尼罗河 是一条流經非洲東部與北部的河流,與中非地區的剛果河、非洲南部的赞比西河以及西非地区的尼日尔河並列非洲最大的四個河流系統。尼罗河 是一条流經非洲東部與北部的河流,與中非地區的剛果河、非洲南部的赞比西河以及西非地区的尼日尔河並列非洲最大的四個河流系統。'
-def run_sentiment_on_text(page):
 
-    pagesList = list(zng(page))
+def run_sentiment_on_text(page):
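+    # Sentences are drawn from every row of the `news` table; the `page`
+    # argument is currently unused.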
+    conn, cursor = get_db()
+    cursor.execute('SELECT * from news;')
+    res = cursor.fetchall()
+    data = []
+    sentiments = []
+    for row in res:
+        docid = row[0]
+        content = row[-1].replace('\n', ' ').replace('\t', ' ')
+        data.append([docid, content])
+
+    allsentences = []
+    for i in range(0, len(data)):
+        txt = data[i][1]
+        pagesList = list(zng(txt))
+        allsentences.append(pagesList)
+
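+    # flatten the per-document sentence lists into one list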
+    allsentences = [x for xs in allsentences for x in xs]
     tokenizer = BertTokenizerFast.from_pretrained('bardsai/finance-sentiment-zh-base')
     model = AutoModelForSequenceClassification.from_pretrained('bardsai/finance-sentiment-zh-base')
     nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
     counts = dict()
-    for p in pagesList:
+    for p in allsentences:
         res = nlp(p)[0]['label']
         counts[res] = counts.get(res, 0) + 1
 
     if 'negative' not in counts.keys():
         counts['negative'] = 0
 
-    sentiments = []
+
     for k in counts.keys():
         sentiments.append({"0 Sentiment": k, "1 Count": counts[k]})
 
+    sentiments = sorted(sentiments, key=lambda x: x["1 Count"], reverse=True)
     result = {'output': sentiments, 'message': 'Done', 'code': 'SUCCESS'}
+
     return result
 
+
+
+
+
+
diff --git a/func/translation/translation.py b/func/translation/translation.py
index 6a03d67..190bd6b 100644
--- a/func/translation/translation.py
+++ b/func/translation/translation.py
@@ -1,26 +1,30 @@
-import torch
-from ckip_transformers.nlp import CkipWordSegmenter, CkipPosTagger, CkipNerChunker
-from transformers import pipeline
-import pandas as pd
-
+import html_to_json
 from shared.translate import translate
 
-#page = '尼罗河 是一条流經非洲東部與北部的河流,與中非地區的剛果河、非洲南部的赞比西河以及西非地区的尼日尔河並列非洲最大的四個河流系統。'
+
 # Perform NER on Text
 def run_translation_on_text(page):
-
-    translation = '<p>Translation</p>'
-    translation = translation + '<span>'
-    translation = translation + translate(page).text
-    translation = translation +'</span>'
-
-    result = {'output': translation,'message': 'Done', 'code': 'SUCCESS'}
-
-    return result
-
-
-
-
-
-
-
+    try:
+        output_json = html_to_json.convert_tables(page)
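+        # `page` is expected to contain HTML tables; the first table is translated
+        # row by row, field by field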
+        output = output_json[0]
+        translated = []
+
+        for item in output:
+            try:
+                translated_item = {}  # Use a proper dictionary to hold key-value pairs
+                for k, v in item.items():
+                    # Translate text and store in dictionary
+                    translated_item[k] = translate(v).text
+                    #time.sleep(1)  # Throttle API calls to avoid exhausting memory and network resources
+
+                translated.append(translated_item)
+            except Exception as e:
+                # Log or handle translation errors and continue processing other items
+                print(f"Error translating item {item}: {e}")
+
+
+        result = {'output': translated, 'message': 'Done', 'code': 'SUCCESS'}
+        return result
+    except Exception as e:
+        print(f"Error in run_translation_on_text: {e}")
+        return {'output': None, 'message': f'Error: {e}', 'code': 'FAILED'}
\ No newline at end of file
diff --git a/func/usas/usas.py b/func/usas/usas.py
index a5c6a19..bf72130 100644
--- a/func/usas/usas.py
+++ b/func/usas/usas.py
@@ -1,4 +1,5 @@
 import spacy
+from db.db_config import get_db
 
 
 # Perform USAS on Text
@@ -7,7 +8,7 @@ import spacy
 
 def run_usas_on_text(page):
     d = {}
-    with open('/Users/tom/PycharmProjects/cognistance/func/usas/usas_desc.txt') as f:
+    with open('/Users/tom/PycharmProjects/cognistance/func/usas/usas_overall.txt') as f:
         for line in f:
             lineL = line.replace('\n', '').split(' ', 1)
             key = lineL[0].strip()
@@ -22,35 +23,60 @@ def run_usas_on_text(page):
     # Adds the Chinese PyMUSAS rule-based tagger to the main spaCy pipeline
     nlp.add_pipe('pymusas_rule_based_tagger', source=chinese_tagger_pipeline)
 
-    output_doc = nlp(page)
-    #data = []
+    conn, cursor = get_db()
+    cursor.execute('SELECT * from news;')
+    res = cursor.fetchall()
+    data = []
 
+    for row in res:
+        docid = row[0]
+        content = row[-1].replace('\n', ' ').replace('\t', ' ')
+        data.append([docid, content])
+
+    #output_doc = nlp(page)
+    usas_tags_with_count = []
     tags = []
+    seen_tags = []
+    for i in range(0, len(data)):
+        id = data[i][0]
+        txt = data[i][1]
+        output_doc = nlp(txt)
 
-    for token in output_doc:
-        start, end = token._.pymusas_mwe_indexes[0]
-        idx = (start, end)
 
-        for el in token._.pymusas_tags:
-            el = el.split('.')[0]
-            #obj = {"word": token.text, "USAS Tags": el, "idx": idx}
-            tags.append(el)
-            #data.append(obj)
 
-    usas_tags_with_count = []
-    seen_tags = []
-    for tag in tags:
-        if tag not in seen_tags:
-            try:
-                freq = tags.count(tag)/1000000
-                usas_tags_with_count.append({"0 Tag": tag, "1 Definition":d[tag],"2 Frequency": freq})
-            except KeyError:
-                pass
-        seen_tags.append(tag)
-
-    usas_tags_with_count = sorted(usas_tags_with_count, key=lambda x: x["2 Frequency"], reverse=True)
-    result = {'output': usas_tags_with_count, 'message': 'Done', 'code': 'SUCCESS'}
+        for token in output_doc:
+            start, end = token._.pymusas_mwe_indexes[0]
+            idx = (start, end)
+
+            for el in token._.pymusas_tags:
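+                # keep only the leading letter of the USAS tag (the broad
+                # discourse field), which is then looked up in the
+                # usas_overall.txt mapping below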
+                el = el.split('.')[0][0]
+                #obj = {"word": token.text, "USAS Tags": el, "idx": idx}
+                tags.append(el)
+                #data.append(obj)
+
+
+
+    # count each tag once over the whole corpus, after all documents have been processed
+    for tag in tags:
+        if tag not in seen_tags:
+            try:
+                freq = tags.count(tag)/1000
+                usas_tags_with_count.append({"0 Tag": tag, "1 Definition":d[tag],"2 Frequency": freq})
+            except KeyError:
+                pass
+        seen_tags.append(tag)
+
+    usas_tags_with_count_sorted = sorted(usas_tags_with_count, key=lambda x: x["2 Frequency"], reverse=True)
+
+
+    result = {'output': usas_tags_with_count_sorted, 'message': 'Done', 'code': 'SUCCESS'}
 
     return result
 
 
+
+
+
+
+
+
diff --git a/func/usasFine/usasFine.py b/func/usasFine/usasFine.py
new file mode 100644
index 0000000..5255519
--- /dev/null
+++ b/func/usasFine/usasFine.py
@@ -0,0 +1,85 @@
+import spacy
+from db.db_config import get_db
+
+
+# Perform USAS on Text
+#page = '尼罗河 是一条流經非洲東部與北部的河流,與中非地區的剛果河、非洲南部的赞比西河以及西非地区的尼日尔河並列非洲最大的四個河流系統。'
+
+
+def run_usasFine_on_text(page):
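+    # Fine-grained variant of run_usas_on_text: keeps the full USAS tag for each
+    # word (defined in usas_desc.txt) rather than just the broad field letter.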
+    d = {}
+    with open('/Users/tom/PycharmProjects/cognistance/func/usas/usas_desc.txt') as f:
+        for line in f:
+            lineL = line.replace('\n', '').split(' ', 1)
+            key = lineL[0].strip()
+            val = lineL[1].strip()
+            d[key] = val
+
+    #print(d)
+    # We exclude the following components as we do not need them.
+    nlp = spacy.load('zh_core_web_sm', exclude=['parser', 'ner'])
+    # Load the Chinese PyMUSAS rule-based tagger in a separate spaCy pipeline
+    chinese_tagger_pipeline = spacy.load('cmn_dual_upos2usas_contextual')
+    # Adds the Chinese PyMUSAS rule-based tagger to the main spaCy pipeline
+    nlp.add_pipe('pymusas_rule_based_tagger', source=chinese_tagger_pipeline)
+
+    conn, cursor = get_db()
+    cursor.execute('SELECT * from news;')
+    res = cursor.fetchall()
+    data = []
+
+    for row in res:
+        docid = row[0]
+        content = row[-1].replace('\n', ' ').replace('\t', ' ')
+        data.append([docid, content])
+
+    #output_doc = nlp(page)
+    usas_tags_with_count = []
+    tags = []
+    seen_tags = []
+    for i in range(0, len(data)):
+        id = data[i][0]
+        txt = data[i][1]
+        output_doc = nlp(txt)
+
+
+
+        for token in output_doc:
+            start, end = token._.pymusas_mwe_indexes[0]
+            idx = (start, end)
+
+            for el in token._.pymusas_tags:
+                el = el.split('.')[0]
+                #obj = {"word": token.text, "USAS Tags": el, "idx": idx}
+                #tags.append(el)
+                word = token.text
+                tags.append(word + '__' + el)
+                #data.append(obj)
+
+
+
+    # count each word__tag pair once over the whole corpus, after all documents have been processed
+    for tag in tags:
+        if tag not in seen_tags:
+            try:
+                freq = tags.count(tag)/1000
+                word = tag.split('__')[0]
+                usas = tag.split('__')[1]
+                if 'A' in usas:
+                    tag_object = {"0 Word":word,"1 Discourse Field": usas, "2 Definition":d[usas],"3 Frequency": freq}
+                    usas_tags_with_count.append(tag_object)
+            except KeyError:
+                pass
+        seen_tags.append(tag)
+
+    usas_tags_with_count_sorted = sorted(usas_tags_with_count, key=lambda x: x["3 Frequency"], reverse=False)[:30]
+    result = {'output': usas_tags_with_count_sorted, 'message': 'Done', 'code': 'SUCCESS'}
+
+    return result
+
+
+
+
+
diff --git a/main.py b/main.py
index 422a801..ea0ba98 100644
--- a/main.py
+++ b/main.py
@@ -45,6 +45,14 @@ def usas():
 
     return result
 
+@app.route("/usasFine", methods=['POST'])
+def usasFine():
+    request_data = request.get_json()
+    page = request_data['page']
+    result = get_usasFine_for_data(page)
+
+    return result
+
 @app.route("/sentiment", methods=['POST'])
 def sentiment():
 
-- 
GitLab