From f23a52a75b5f0118f56351dc26e062f645556592 Mon Sep 17 00:00:00 2001 From: Tom Edwards <edwardstj1@cardiff.ac.uk> Date: Tue, 17 Dec 2024 04:30:49 +0000 Subject: [PATCH] demo --- api/api_functions.py | 9 ++ func/collocation/collocation.py | 74 +++++++----- func/concordance/concordance.py | 167 +++++++++++++++------------- func/mutlidatasets/multidatasets.py | 2 +- func/ner/ner.py | 56 ++++++---- func/neroverall/neroverall.py | 21 ++-- func/sentiment/sentiment.py | 33 +++++- func/translation/translation.py | 48 ++++---- func/usas/usas.py | 74 ++++++++---- func/usasFine/usasFine.py | 85 ++++++++++++++ main.py | 8 ++ 11 files changed, 387 insertions(+), 190 deletions(-) create mode 100644 func/usasFine/usasFine.py diff --git a/api/api_functions.py b/api/api_functions.py index 61afca1..d4d5cd1 100644 --- a/api/api_functions.py +++ b/api/api_functions.py @@ -8,6 +8,7 @@ from func.collocation.collocation import * from func.concordance.concordance import * from func.mutlidatasets.multidatasets import * from func.neroverall.neroverall import * +from func.usasFine.usasFine import * # Perform NER on a file # TAKES XML text page @@ -44,6 +45,14 @@ def get_usas_for_data(page): return make_response(jsonify(result), 400) +def get_usasFine_for_data(page): + result = run_usasFine_on_text(page) + + if result["code"] == "SUCCESS": + return make_response(jsonify(result), 201) + + return make_response(jsonify(result), 400) + # Perform Sentiment analysis on a file # TAKES XML text page diff --git a/func/collocation/collocation.py b/func/collocation/collocation.py index 64396ea..6ca9db5 100644 --- a/func/collocation/collocation.py +++ b/func/collocation/collocation.py @@ -1,65 +1,79 @@ import spacy import math from shared.translate import translate +import nltk from nltk.collocations import TrigramCollocationFinder from nltk.metrics import TrigramAssocMeasures from nltk.collocations import BigramCollocationFinder from nltk.metrics import BigramAssocMeasures +from db.db_config import get_db -#page = '专精特新â€ä¼ä¸šï¼Œæ˜¯æŒ‡å…·æœ‰ä¸“业化ã€ç²¾ç»†åŒ–ã€ç‰¹è‰²åŒ–ã€æ–°é¢–化四大特å¾çš„ä¸å°ä¼ä¸šã€‚创新是这类ä¼ä¸šçš„çµé‚,足够的 ç ”å‘费用投入则是开展创新的é‡è¦ä¿éšœã€‚许多尚处在æˆé•¿æœŸçš„“专精特新â€ä¼ä¸šï¼Œè¿‘期普ééé‡â€œé’±ç´§â€éš¾é¢˜ã€‚å¦‚ä½•é›†èš æ›´å¤šçš„èµ„é‡‘æŠ•å…¥ç ”å‘ã€ä¿æŒåˆ›æ–°é¢†å…ˆåœ°ä½æ˜¯è¿™äº›ä¼ä¸šè¿‘æ¥é¢ä¸´çš„最大烦æ¼ã€‚“作为一家新ææ–™ç ”å‘å…¬å¸ï¼Œåˆ›æ–°æ˜¯æˆ‘们å‘展的é‡è¦é©±åŠ¨åŠ›ï¼Œåªæœ‰ç ”å‘投入的ä¸æ–åŠ ç ,ä¼ä¸šåˆ›æ–°å‘展的æ¥ä¼æ‰ä¸ä¼š é™é€Ÿã€‚â€æµ™æ±Ÿçœâ€œä¸“精特新â€ä¼ä¸šã€å®æ³¢åˆ›æ¶¦æ–°æ料有é™å…¬å¸è‘£äº‹é•¿å´æ™¯æ™–说,过去3年,ä¼ä¸šåœ¨ç ”å‘投入方é¢ä¸é—余力,累计投入2500万元,这对ä¼ä¸šæ¥è¯´ä¸æ˜¯ä¸ªå°æ•°ç›®ã€‚ ä»Šå¹´æ–°å…´å¸‚åœºçš„ç ”å‘需求ååˆ†è¿«åˆ‡ï¼Œæˆ‘ä»¬ä¸€ç›´æƒ³åŠ å¿« 超高纯钛åŠé’›åˆé‡‘ä¸è¯•ç”Ÿäº§çº¿é¡¹ç›® çš„ç ”å‘è¿›åº¦ï¼Œä½†è‹¦äºŽèµ„é‡‘ä¸ è¶³ã€‚ä»¤äººé«˜å…´çš„æ˜¯ï¼Œä»Šå¹´4月340万元å˜é‡å¢žå€¼ç¨Žç•™æŠµç¨Žé¢çš„到账,有效缓解了ä¼ä¸šçš„èµ„é‡‘åŽ‹åŠ›ï¼ŒåŠ å¿«äº†ä¼ä¸šçš„ç ”å‘ è¿›åº¦ã€‚â€å´æ™¯æ™–说,目å‰ï¼Œâ€œè¶…高纯钛åŠé’›åˆé‡‘ä¸è¯•ç”Ÿäº§çº¿é¡¹ç›®â€æ£åœ¨æœ‰åºæŽ¨è¿›ï¼Œä¸€æ—¦æŠ•äº§å°†ç¼“解åŠå¯¼ä½“产业的高纯钛原æ料供应ä¸è¶³é—®é¢˜ï¼Œæå‡å›½äº§æº…å°„é¶æ的市场竞争力' +def run_collocation_on_text(page): + collocations = [] + nlp = spacy.load('zh_core_web_sm') + conn, cursor = get_db() + cursor.execute('SELECT * from news;') + res = cursor.fetchall() + data = [] -def clean(text): + for row in res: - text = text.replace('<p>', ' ') - text = text.replace('</p>', ' ') - text = text.replace('<br>', ' ') - text = text.replace('</br>', ' ') - text = text.replace('><', ' ') - text = text.replace('\u3000', ' ') - text = text.replace('br', ' ') - cltext = text.replace('\n', ' ').strip() - return str(cltext) + docid = row[0] -def 
run_collocation_on_text(page): + content = row[-1].replace('\n', ' ').replace('\t', ' ') - page = clean(page) + data.append([docid, content]) corpus = [] - collocations = [] + for i in range(0, len(data)): + id = data[i][0] + txt = data[i][1] + doc = nlp(txt) + - nlp = spacy.load('zh_core_web_sm') - doc = nlp(page) - for token in doc: - if not token.is_stop: - # print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,token.shape_, token.is_alpha, token.is_stop) - corpus.append(token.text.lower()) + for token in doc: + if not token.is_stop: + # print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,token.shape_, token.is_alpha, token.is_stop) + corpus.append(token.text.lower()) biagram_collocation = BigramCollocationFinder.from_words(corpus) #biagram_collocation.apply_freq_filter(3) - trigram_collocation = TrigramCollocationFinder.from_words(corpus) + #trigram_collocation = TrigramCollocationFinder.from_words(corpus) #trigram_collocation.apply_freq_filter(3) - scoredbigrams = biagram_collocation.score_ngrams(BigramAssocMeasures().likelihood_ratio)[:10] - scoretrigrams = trigram_collocation.score_ngrams(TrigramAssocMeasures().likelihood_ratio)[:10] - allscores = scoredbigrams+scoretrigrams - for item in allscores: + scoredbigrams = biagram_collocation.score_ngrams(BigramAssocMeasures().likelihood_ratio) + bigramterms = [] + for j in scoredbigrams: + jstr = " ".join(b for b in j[0]) + bigramterms.append(jstr) + + #scoretrigrams = trigram_collocation.score_ngrams(TrigramAssocMeasures().likelihood_ratio) + #allscores = scoredbigrams+scoretrigrams + for item in scoredbigrams: itemstr = " ".join(i for i in item[0]) - translation = translate(itemstr).text.lower() - score = item[1]/1000000 - collocations.append({"0 Term": itemstr,"1 Translation":translation ,"2 LogRatio": score}) + if '部' in itemstr: + itemstrnew = itemstr.replace('部','').strip().replace(' ','') + translation = translate(itemstr.replace('部','').strip()).text.lower() + #print(translation) + #print('--------------') + score = round(item[1],3) + freq = bigramterms.count(itemstr)/ 1000 + collocations.append({"0 Collocate": itemstrnew , "1 LogRatio": score, "2 Frequency":freq}) - collocations = sorted(collocations, key=lambda x: x["2 LogRatio"], reverse=True) + collocationsorted = sorted(collocations, key=lambda x: (x["1 LogRatio"],x["2 Frequency"]), reverse=True)[:10] - result = {'output': collocations, 'message': 'Done', 'code': 'SUCCESS'} + result = {'output': collocationsorted, 'message': 'Done', 'code': 'SUCCESS'} return result + + diff --git a/func/concordance/concordance.py b/func/concordance/concordance.py index c716d20..b9fe3fd 100644 --- a/func/concordance/concordance.py +++ b/func/concordance/concordance.py @@ -1,111 +1,120 @@ import spacy -import math -from collections import Counter, defaultdict from shared.translate import translate from wasabi import Printer from spacy.matcher import PhraseMatcher -import re -from nltk.collocations import TrigramCollocationFinder -from nltk.metrics import TrigramAssocMeasures +from db.db_config import get_db from nltk.collocations import BigramCollocationFinder from nltk.metrics import BigramAssocMeasures -#page = '专精特新â€ä¼ä¸šï¼Œæ˜¯æŒ‡å…·æœ‰ä¸“业化ã€ç²¾ç»†åŒ–ã€ç‰¹è‰²åŒ–ã€æ–°é¢–化四大特å¾çš„ä¸å°ä¼ä¸šã€‚创新是这类ä¼ä¸šçš„çµé‚,足够的 ç ”å‘费用投入则是开展创新的é‡è¦ä¿éšœã€‚许多尚处在æˆé•¿æœŸçš„“专精特新â€ä¼ä¸šï¼Œè¿‘期普ééé‡â€œé’±ç´§â€éš¾é¢˜ã€‚å¦‚ä½•é›†èš æ›´å¤šçš„èµ„é‡‘æŠ•å…¥ç ”å‘ã€ä¿æŒåˆ›æ–°é¢†å…ˆåœ°ä½æ˜¯è¿™äº›ä¼ä¸šè¿‘æ¥é¢ä¸´çš„最大烦æ¼ã€‚“作为一家新ææ–™ç ”å‘å…¬å¸ï¼Œåˆ›æ–°æ˜¯æˆ‘们å‘展的é‡è¦é©±åŠ¨åŠ›ï¼Œåªæœ‰ç 
”å‘投入的ä¸æ–åŠ ç ,ä¼ä¸šåˆ›æ–°å‘展的æ¥ä¼æ‰ä¸ä¼š é™é€Ÿã€‚â€æµ™æ±Ÿçœâ€œä¸“精特新â€ä¼ä¸šã€å®æ³¢åˆ›æ¶¦æ–°æ料有é™å…¬å¸è‘£äº‹é•¿å´æ™¯æ™–说,过去3年,ä¼ä¸šåœ¨ç ”å‘投入方é¢ä¸é—余力,累计投入2500万元,这对ä¼ä¸šæ¥è¯´ä¸æ˜¯ä¸ªå°æ•°ç›®ã€‚ ä»Šå¹´æ–°å…´å¸‚åœºçš„ç ”å‘需求ååˆ†è¿«åˆ‡ï¼Œæˆ‘ä»¬ä¸€ç›´æƒ³åŠ å¿« 超高纯钛åŠé’›åˆé‡‘ä¸è¯•ç”Ÿäº§çº¿é¡¹ç›® çš„ç ”å‘è¿›åº¦ï¼Œä½†è‹¦äºŽèµ„é‡‘ä¸ è¶³ã€‚ä»¤äººé«˜å…´çš„æ˜¯ï¼Œä»Šå¹´4月340万元å˜é‡å¢žå€¼ç¨Žç•™æŠµç¨Žé¢çš„到账,有效缓解了ä¼ä¸šçš„èµ„é‡‘åŽ‹åŠ›ï¼ŒåŠ å¿«äº†ä¼ä¸šçš„ç ”å‘ è¿›åº¦ã€‚â€å´æ™¯æ™–说,目å‰ï¼Œâ€œè¶…高纯钛åŠé’›åˆé‡‘ä¸è¯•ç”Ÿäº§çº¿é¡¹ç›®â€æ£åœ¨æœ‰åºæŽ¨è¿›ï¼Œä¸€æ—¦æŠ•äº§å°†ç¼“解åŠå¯¼ä½“产业的高纯钛原æ料供应ä¸è¶³é—®é¢˜ï¼Œæå‡å›½äº§æº…å°„é¶æ的市场竞争力' -def clean(text): +def collocations(): + collocations = [] - text = text.replace('<p>', ' ') - text = text.replace('</p>', ' ') - text = text.replace('<br>', ' ') - text = text.replace('</br>', ' ') - text = text.replace('><', ' ') - text = text.replace('\u3000', ' ') - text = text.replace('br', ' ') - text = text.replace('——', '') - text = text.replace('[38;5;1m', '') - text = text.replace('[0m','') - cltext = text.replace('\n', ' ').strip() - return str(cltext) + nlp = spacy.load('zh_core_web_sm') + conn, cursor = get_db() + cursor.execute('SELECT * from news;') + res = cursor.fetchall() -def collocations(doc): - corpus = [] - collocations = [] + data = [] - for token in doc: + for row in res: + docid = row[0] - if not token.is_stop: - # print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,token.shape_, token.is_alpha, token.is_stop) - corpus.append(token.text.lower()) + content = row[-1].replace('\n', ' ').replace('\t', ' ') - biagram_collocation = BigramCollocationFinder.from_words(corpus) - # biagram_collocation.apply_freq_filter(3) - trigram_collocation = TrigramCollocationFinder.from_words(corpus) - # trigram_collocation.apply_freq_filter(3) + data.append([docid, content]) - scoredbigrams = biagram_collocation.score_ngrams(BigramAssocMeasures().likelihood_ratio)[:10] + corpus = [] + for i in range(0, len(data)): + txt = data[i][1] + doc = nlp(txt) - scoretrigrams = trigram_collocation.score_ngrams(TrigramAssocMeasures().likelihood_ratio)[:10] - allscores = scoredbigrams + scoretrigrams - for item in allscores: - itemstr = " ".join(i for i in item[0]) - translation = translate(itemstr).text.lower() - score = item[1] / 1000000 - collocations.append({"0 Term": itemstr, "1 Translation": translation, "2 LogRatio": score}) + for token in doc: + if not token.is_stop: + # print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,token.shape_, token.is_alpha, token.is_stop) + corpus.append(token.text.lower()) + + biagram_collocation = BigramCollocationFinder.from_words(corpus) - collocations = sorted(collocations, key=lambda x: x["2 LogRatio"], reverse=True) + scoredbigrams = biagram_collocation.score_ngrams(BigramAssocMeasures().likelihood_ratio) + bigramterms = [] + for j in scoredbigrams: + jstr = " ".join(b for b in j[0]) + bigramterms.append(jstr) + + # scoretrigrams = trigram_collocation.score_ngrams(TrigramAssocMeasures().likelihood_ratio) + # allscores = scoredbigrams+scoretrigrams + for item in scoredbigrams: + itemstr = " ".join(i for i in item[0]) + if '部' in itemstr: + itemstrnew = itemstr + translation = translate(itemstr).text.lower() + # print(translation) + # print('--------------') + score = round(item[1], 3) - terms = [item.get('0 Term') for item in collocations] + freq = bigramterms.count(itemstr) / 1000 + collocations.append({"0 Collocate": itemstrnew, "1 LogRatio": score, "2 Frequency": freq}) + collocationsorted = sorted(collocations, key=lambda 
x: (x["1 LogRatio"], x["2 Frequency"]), reverse=True)[:10] - return terms + terms = [d['0 Collocate'] for d in collocationsorted] + return terms def run_concordance_on_text(page): - page = clean(page) - print('Page') - print(page) nlp = spacy.load('zh_core_web_sm') - doc = nlp(page) - terms = collocations(doc) + conn, cursor = get_db() + cursor.execute('SELECT * from news;') + res = cursor.fetchall() + data = [] + for row in res: + docid = row[0] + content = row[-1].replace('\n', ' ').replace('\t', ' ') + data.append([docid, content]) + concordances = [] - matcher = PhraseMatcher(nlp.vocab,attr='LOWER') - patterns = [nlp.make_doc(term) for term in terms] - matcher.add("TermCollocations", patterns) - - matches = matcher(doc) - match = Printer() - for i, start, end in matches: - perecedingSlice = doc[start - 20: start].text - if '。' in perecedingSlice: - perecedingSlice = perecedingSlice.split('。')[1] - else: - perecedingSlice = perecedingSlice.strip() - - - #perecedingSliceTr = clean(translate(doc[start - 20: start]).text) - matchedTerm = doc[start:end].text - print(matchedTerm) - #matchedTerm = doc[start:end].text - matchedTermTr = match.text(translate(doc[start:end].text).text, color='red', no_print=True) - #matchedTermTr = match.text(translate(doc[start:end].text).text) - followingSlice = doc[end:end + 20].text - #followingSliceTr = clean(translate(doc[end:end + 20]).text) - - #context = perecedingSlice+', '+matchedTerm+', '+followingSlice - - #contextTr = perecedingSliceTr+', '+matchedTermTr+', '+followingSliceTr - #concordances.append({"0 Term": escapeAnscii(matchedTerm), "1 Eng": escapeAnscii(matchedTermTr), "2 Context":escapeAnscii(context), "3 Context Eng":escapeAnscii(contextTr)}) - concordances.append({"0 Preceded By":perecedingSlice,"1 Term": matchedTerm, "2 Followed By": followingSlice}) - - + terms = collocations() + for i in range(0, len(data)): + id = data[i][0] + txt = data[i][1] + doc = nlp(txt) + + + matcher = PhraseMatcher(nlp.vocab,attr='LOWER') + patterns = [nlp.make_doc(term) for term in terms] + matcher.add("TermCollocations", patterns) + + matches = matcher(doc) + match = Printer() + for j, start, end in matches: + perecedingSlice = doc[start - 20: start].text + if '。' in perecedingSlice: + perecedingSlice = perecedingSlice.split('。')[1] + else: + perecedingSlice = perecedingSlice.strip() + + + #perecedingSliceTr = clean(translate(doc[start - 20: start]).text) + matchedTerm = doc[start:end].text + #matchedTerm = doc[start:end].text + matchedTermTr = match.text(translate(doc[start:end].text).text, color='red', no_print=True) + #matchedTermTr = match.text(translate(doc[start:end].text).text) + followingSlice = doc[end:end + 20].text + #followingSliceTr = clean(translate(doc[end:end + 20]).text) + + #context = perecedingSlice+', '+matchedTerm+', '+followingSlice + + #contextTr = perecedingSliceTr+', '+matchedTermTr+', '+followingSliceTr + #concordances.append({"0 Term": escapeAnscii(matchedTerm), "1 Eng": escapeAnscii(matchedTermTr), "2 Context":escapeAnscii(context), "3 Context Eng":escapeAnscii(contextTr)}) + concordances.append({"0 Preceded By":perecedingSlice,"1 Term": matchedTerm, "2 Followed By": followingSlice}) + + result = {'output': concordances, 'message': 'Done', 'code': 'SUCCESS'} return result -#def main(): -# result = run_concordance_on_text(page) -#main() diff --git a/func/mutlidatasets/multidatasets.py b/func/mutlidatasets/multidatasets.py index 00c845c..6548a34 100644 --- a/func/mutlidatasets/multidatasets.py +++ b/func/mutlidatasets/multidatasets.py 
@@ -12,7 +12,7 @@ def run_multidatasets(): data = [] for row in res: print(row) - data.append({"0 Id": row[0], "1 Title": row[1], "2 Date": row[4],"3 Content":row[-1]}) + data.append({"0 Title": row[1], "1 Date": row[4]}) diff --git a/func/ner/ner.py b/func/ner/ner.py index a9d8977..e783903 100644 --- a/func/ner/ner.py +++ b/func/ner/ner.py @@ -2,42 +2,54 @@ import torch from ckip_transformers.nlp import CkipWordSegmenter, CkipPosTagger, CkipNerChunker from transformers import pipeline import pandas as pd +from db.db_config import get_db -from shared.translate import translate +#from shared.translate import translate #page = '尼罗河 是一æ¡æµç¶“éžæ´²æ±éƒ¨èˆ‡åŒ—部的河æµï¼Œèˆ‡ä¸éžåœ°å€çš„剛果河ã€éžæ´²å—部的赞比西河以åŠè¥¿éžåœ°åŒºçš„尼日尔河並列éžæ´²æœ€å¤§çš„四個河æµç³»çµ±ã€‚' # Perform NER on Text def run_ner_on_text(page): ner_driver = CkipNerChunker(model="bert-base") + conn, cursor = get_db() + cursor.execute('SELECT * from news;') + res = cursor.fetchall() + data = [] + for row in res: + docid = row[0] + content = row[-1].replace('\n', ' ').replace('\t', ' ') + data.append([docid, content]) + ner_words_with_count = [] + for i in range(0, len(data)): + id = data[i][0] + txt = data[i][1] + ner = ner_driver([txt]) - ner = ner_driver([page]) + tags = [] + for item in ner[0]: + word = item.word + ner = item.ner + tags.append(word+'__'+ner) - tags = [] - for item in ner[0]: - word = item.word - ner = item.ner - #idx = item.idx - tags.append(word+'__'+ner) + seen_words = [] + for tag in tags: + if tag not in seen_words: - ner_words_with_count = [] - seen_words = [] - for tag in tags: - if tag not in seen_words: + freq = tags.count(tag) / 1000 + word = tag.split('__')[0] + ner = tag.split('__')[1] + #translation = translate(word).text + if ner == 'PERSON': + ner_words_with_count.append({"0 Word": word, "1 NER": ner, "2 Frequency": freq}) + seen_words.append(tag) - freq = tags.count(tag) / 1000000 - word = tag.split('__')[0] - ner = tag.split('__')[1] - translation = translate(word).text - ner_words_with_count.append({"0 Word": word, "1 Translation":translation, "2 NER": ner, "3 Frequency": freq}) - seen_words.append(tag) + perdoc = sorted(ner_words_with_count, key=lambda x: x["2 Frequency"], reverse=True)[:30] - data = sorted(ner_words_with_count, key=lambda x: x["3 Frequency"], reverse=True) - result = {'output': data,'message': 'Done', 'code': 'SUCCESS'} + result = {'output': perdoc,'message': 'Done', 'code': 'SUCCESS'} return result @@ -47,3 +59,7 @@ def run_ner_on_text(page): + + + + diff --git a/func/neroverall/neroverall.py b/func/neroverall/neroverall.py index 3c22352..32f79ab 100644 --- a/func/neroverall/neroverall.py +++ b/func/neroverall/neroverall.py @@ -16,39 +16,42 @@ def run_neroverall_on_text(page): data.append([docid, content]) ner_with_count = [] + tags = [] + ners = [] + seen_words = [] + seen_tags = [] + for i in range(0,len(data)): id = data[i][0] txt = data[i][1] ner = ner_driver([txt]) - tags = [] + for item in ner[0]: word = item.word ner = item.ner tags.append(word + '__' + ner) - ners = [] - seen_words = [] - seen_tags = [] for tag in tags: if tag not in seen_words: ner = tag.split('__')[1].strip() - ners.append(ner) seen_words.append(tag) for n in ners: if n not in seen_tags: - freq = ners.count(n) / 1000000 - ner_with_count.append({"0 Doc Id": id, "1 NER": n, "2 Frequency": freq}) + freq = ners.count(n) / 1000 + ner_with_count.append({"1 NER": n, "2 Frequency": freq}) seen_tags.append(n) - #nerall = sorted(ner_with_count, key=lambda x: (x["0 Doc Id"],x["2 Frequency"]), reverse=True) + nerall = 
sorted(ner_with_count, key=lambda x: x["2 Frequency"], reverse=True) - result = {'output': ner_with_count,'message': 'Done', 'code': 'SUCCESS'} + + result = {'output': nerall,'message': 'Done', 'code': 'SUCCESS'} return result + diff --git a/func/sentiment/sentiment.py b/func/sentiment/sentiment.py index 34d4fda..c12a014 100644 --- a/func/sentiment/sentiment.py +++ b/func/sentiment/sentiment.py @@ -8,30 +8,53 @@ from transformers import ( from transformers import pipeline import re +from db.db_config import get_db def zng(paragraph): for sent in re.findall(u'[^!?。\.\!\?]+[!?。\.\!\?]?', paragraph, flags=re.U): yield sent -#page = '尼罗河 是一æ¡æµç¶“éžæ´²æ±éƒ¨èˆ‡åŒ—部的河æµï¼Œèˆ‡ä¸éžåœ°å€çš„剛果河ã€éžæ´²å—部的赞比西河以åŠè¥¿éžåœ°åŒºçš„尼日尔河並列éžæ´²æœ€å¤§çš„四個河æµç³»çµ±ã€‚尼罗河 是一æ¡æµç¶“éžæ´²æ±éƒ¨èˆ‡åŒ—部的河æµï¼Œèˆ‡ä¸éžåœ°å€çš„剛果河ã€éžæ´²å—部的赞比西河以åŠè¥¿éžåœ°åŒºçš„尼日尔河並列éžæ´²æœ€å¤§çš„四個河æµç³»çµ±ã€‚' -def run_sentiment_on_text(page): - pagesList = list(zng(page)) +def run_sentiment_on_text(page): + conn, cursor = get_db() + cursor.execute('SELECT * from news;') + res = cursor.fetchall() + data = [] + sentiments = [] + for row in res: + docid = row[0] + content = row[-1].replace('\n', ' ').replace('\t', ' ') + data.append([docid, content]) + + allsentences = [] + for i in range(0, len(data)): + txt = data[i][1] + pagesList = list(zng(txt)) + allsentences.append(pagesList) + + allsentences = [x for xs in allsentences for x in xs] tokenizer = BertTokenizerFast.from_pretrained('bardsai/finance-sentiment-zh-base') model = AutoModelForSequenceClassification.from_pretrained('bardsai/finance-sentiment-zh-base') nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer) counts = dict() - for p in pagesList: + for p in allsentences: res = nlp(p)[0]['label'] counts[res] = counts.get(res, 0) + 1 if 'negative' not in counts.keys(): counts['negative'] = 0 - sentiments = [] + for k in counts.keys(): sentiments.append({"0 Sentiment": k, "1 Count": counts[k]}) + sentiments = sorted(sentiments, key=lambda x: x["1 Count"], reverse=True) result = {'output': sentiments, 'message': 'Done', 'code': 'SUCCESS'} + return result + + + + + diff --git a/func/translation/translation.py b/func/translation/translation.py index 6a03d67..190bd6b 100644 --- a/func/translation/translation.py +++ b/func/translation/translation.py @@ -1,26 +1,30 @@ -import torch -from ckip_transformers.nlp import CkipWordSegmenter, CkipPosTagger, CkipNerChunker -from transformers import pipeline -import pandas as pd - +import html_to_json from shared.translate import translate -#page = '尼罗河 是一æ¡æµç¶“éžæ´²æ±éƒ¨èˆ‡åŒ—部的河æµï¼Œèˆ‡ä¸éžåœ°å€çš„剛果河ã€éžæ´²å—部的赞比西河以åŠè¥¿éžåœ°åŒºçš„尼日尔河並列éžæ´²æœ€å¤§çš„四個河æµç³»çµ±ã€‚' + # Perform NER on Text def run_translation_on_text(page): - - translation = '<p>Translation</p>' - translation = translation + '<span>' - translation = translation + translate(page).text - translation = translation +'</span>' - - result = {'output': translation,'message': 'Done', 'code': 'SUCCESS'} - - return result - - - - - - - + try: + output_json = html_to_json.convert_tables(page) + output = output_json[0] + translated = [] + + for item in output: + try: + translated_item = {} # Use a proper dictionary to hold key-value pairs + for k, v in item.items(): + # Translate text and store in dictionary + translated_item[k] = translate(v).text + #time.sleep(1) # Throttle API calls to avoid exhausting memory and network resources + + translated.append(translated_item) + except Exception as e: + # Log or handle translation errors and continue processing other 
items + print(f"Error translating item {item}: {e}") + + + result = {'output': translated, 'message': 'Done', 'code': 'SUCCESS'} + return result + except Exception as e: + print(f"Error in run_translation_on_text: {e}") + return {'output': None, 'message': f'Error: {e}', 'code': 'FAILED'} \ No newline at end of file diff --git a/func/usas/usas.py b/func/usas/usas.py index a5c6a19..bf72130 100644 --- a/func/usas/usas.py +++ b/func/usas/usas.py @@ -1,4 +1,5 @@ import spacy +from db.db_config import get_db # Perform USAS on Text @@ -7,7 +8,7 @@ import spacy def run_usas_on_text(page): d = {} - with open('/Users/tom/PycharmProjects/cognistance/func/usas/usas_desc.txt') as f: + with open('/Users/tom/PycharmProjects/cognistance/func/usas/usas_overall.txt') as f: for line in f: lineL = line.replace('\n', '').split(' ', 1) key = lineL[0].strip() @@ -22,35 +23,60 @@ def run_usas_on_text(page): # Adds the Chinese PyMUSAS rule-based tagger to the main spaCy pipeline nlp.add_pipe('pymusas_rule_based_tagger', source=chinese_tagger_pipeline) - output_doc = nlp(page) - #data = [] + conn, cursor = get_db() + cursor.execute('SELECT * from news;') + res = cursor.fetchall() + data = [] + for row in res: + docid = row[0] + content = row[-1].replace('\n', ' ').replace('\t', ' ') + data.append([docid, content]) + + #output_doc = nlp(page) + usas_tags_with_count = [] tags = [] + seen_tags = [] + for i in range(0, len(data)): + id = data[i][0] + txt = data[i][1] + output_doc = nlp(txt) - for token in output_doc: - start, end = token._.pymusas_mwe_indexes[0] - idx = (start, end) - for el in token._.pymusas_tags: - el = el.split('.')[0] - #obj = {"word": token.text, "USAS Tags": el, "idx": idx} - tags.append(el) - #data.append(obj) - usas_tags_with_count = [] - seen_tags = [] - for tag in tags: - if tag not in seen_tags: - try: - freq = tags.count(tag)/1000000 - usas_tags_with_count.append({"0 Tag": tag, "1 Definition":d[tag],"2 Frequency": freq}) - except KeyError: - pass - seen_tags.append(tag) - - usas_tags_with_count = sorted(usas_tags_with_count, key=lambda x: x["2 Frequency"], reverse=True) - result = {'output': usas_tags_with_count, 'message': 'Done', 'code': 'SUCCESS'} + for token in output_doc: + start, end = token._.pymusas_mwe_indexes[0] + idx = (start, end) + + for el in token._.pymusas_tags: + el = el.split('.')[0][0] + #obj = {"word": token.text, "USAS Tags": el, "idx": idx} + tags.append(el) + #data.append(obj) + + + + for tag in tags: + if tag not in seen_tags: + try: + freq = tags.count(tag)/1000 + + usas_tags_with_count.append({"0 Tag": tag, "1 Definition":d[tag],"2 Frequency": freq}) + except KeyError: + pass + seen_tags.append(tag) + + usas_tags_with_count_sorted = sorted(usas_tags_with_count, key=lambda x: x["2 Frequency"], reverse=True) + + + result = {'output': usas_tags_with_count_sorted, 'message': 'Done', 'code': 'SUCCESS'} return result + + + + + + diff --git a/func/usasFine/usasFine.py b/func/usasFine/usasFine.py new file mode 100644 index 0000000..5255519 --- /dev/null +++ b/func/usasFine/usasFine.py @@ -0,0 +1,85 @@ +import spacy +from db.db_config import get_db + + +# Perform USAS on Text +#page = '尼罗河 是一æ¡æµç¶“éžæ´²æ±éƒ¨èˆ‡åŒ—部的河æµï¼Œèˆ‡ä¸éžåœ°å€çš„剛果河ã€éžæ´²å—部的赞比西河以åŠè¥¿éžåœ°åŒºçš„尼日尔河並列éžæ´²æœ€å¤§çš„四個河æµç³»çµ±ã€‚' + + +def run_usasFine_on_text(page): + d = {} + with open('/Users/tom/PycharmProjects/cognistance/func/usas/usas_desc.txt') as f: + for line in f: + lineL = line.replace('\n', '').split(' ', 1) + key = lineL[0].strip() + val = lineL[1].strip() + d[key] = val + + 
print(d) + # We exclude the following components as we do not need them. + nlp = spacy.load('zh_core_web_sm', exclude=['parser', 'ner']) + # Load the Chinese PyMUSAS rule-based tagger in a separate spaCy pipeline + chinese_tagger_pipeline = spacy.load('cmn_dual_upos2usas_contextual') + # Adds the Chinese PyMUSAS rule-based tagger to the main spaCy pipeline + nlp.add_pipe('pymusas_rule_based_tagger', source=chinese_tagger_pipeline) + + conn, cursor = get_db() + cursor.execute('SELECT * from news;') + res = cursor.fetchall() + data = [] + + for row in res: + docid = row[0] + content = row[-1].replace('\n', ' ').replace('\t', ' ') + data.append([docid, content]) + + #output_doc = nlp(page) + usas_tags_with_count = [] + tags = [] + seen_tags = [] + for i in range(0, len(data)): + id = data[i][0] + txt = data[i][1] + output_doc = nlp(txt) + + + + for token in output_doc: + start, end = token._.pymusas_mwe_indexes[0] + idx = (start, end) + + for el in token._.pymusas_tags: + el = el.split('.')[0] + #obj = {"word": token.text, "USAS Tags": el, "idx": idx} + #tags.append(el) + word = token.text + tags.append(word + '__' + el) + #data.append(obj) + + + + for tag in tags: + if tag not in seen_tags: + try: + freq = tags.count(tag)/1000 + word = tag.split('__')[0] + usas = tag.split('__')[1] + + if 'A' in usas: + tag_object = {"0 Word":word,"1 Discourse Field": usas, "2 Definition":d[usas],"3 Frequency": freq} + + usas_tags_with_count.append(tag_object) + + except KeyError: + pass + seen_tags.append(tag) + + usas_tags_with_count_sorted = sorted(usas_tags_with_count, key=lambda x: x["3 Frequency"], reverse=False)[:30] + result = {'output': usas_tags_with_count_sorted, 'message': 'Done', 'code': 'SUCCESS'} + + return result + + + + + diff --git a/main.py b/main.py index 422a801..ea0ba98 100644 --- a/main.py +++ b/main.py @@ -45,6 +45,14 @@ def usas(): return result +@app.route("/usasFine", methods=['POST']) +def usasFine(): + request_data = request.get_json() + page = request_data['page'] + result = get_usasFine_for_data(page) + + return result + @app.route("/sentiment", methods=['POST']) def sentiment(): -- GitLab
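Notes on the scoring step in collocation.py and concordance.py: both now rank bigrams over a stop-word-filtered token stream using NLTK's log-likelihood ratio. A minimal, self-contained sketch of that step, with a toy English token list standing in for the spaCy tokens built from the news table (the data below is illustrative only, not from the corpus):

from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

# Toy token stream standing in for the stop-word-filtered spaCy tokens
# that the patch builds from the news table.
tokens = ["pure", "titanium", "alloy", "pilot", "line",
          "pure", "titanium", "alloy", "production", "line"]

finder = BigramCollocationFinder.from_words(tokens)
# The patch keeps the frequency filter commented out; enabling it would
# drop bigrams seen fewer than 2 times before scoring.
# finder.apply_freq_filter(2)

# score_ngrams returns ((w1, w2), score) pairs, highest-scoring first.
for (w1, w2), score in finder.score_ngrams(BigramAssocMeasures.likelihood_ratio)[:5]:
    print(w1, w2, round(score, 3))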
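run_concordance_on_text builds its keyword-in-context windows with spaCy's PhraseMatcher over the top collocates. The sketch below shows only that windowing step, on an illustrative term list and sentence; spacy.blank('zh') (character-level tokenizer) is used here purely so the sketch runs without the zh_core_web_sm download the patch relies on. It also clamps the left boundary at 0, since doc[start - 20:start] with start < 20 would otherwise be interpreted relative to the end of the doc:

import spacy
from spacy.matcher import PhraseMatcher

# Lightweight stand-in for the zh_core_web_sm pipeline loaded in the patch.
nlp = spacy.blank("zh")

terms = ["高纯钛"]                      # illustrative; the patch uses collocations()
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
matcher.add("TermCollocations", [nlp.make_doc(t) for t in terms])

doc = nlp("半导体产业需要高纯钛原材料。")  # illustrative sentence
for match_id, start, end in matcher(doc):
    # Clamp at 0 so a match near the start cannot wrap around to the doc's end.
    left = doc[max(start - 20, 0):start].text
    right = doc[end:end + 20].text
    print(left, "|", doc[start:end].text, "|", right)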
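The new /usasFine route in main.py accepts a POSTed JSON body and returns the fine-grained USAS table. A minimal client sketch, assuming the Flask app is served locally on port 5000 (host and port are not part of this patch); the route still reads request_data['page'], so the key must be present even though run_usasFine_on_text now takes its documents from the news table:

import requests

URL = "http://localhost:5000/usasFine"   # assumed host/port; adjust to the deployment

# 'page' is still required by the route handler, although the current
# implementation reads its text from the database rather than the payload.
resp = requests.post(URL, json={"page": ""})

print(resp.status_code)                  # 201 on SUCCESS, 400 otherwise
for row in resp.json()["output"]:
    print(row["0 Word"], row["1 Discourse Field"], row["3 Frequency"])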