Commit f23a52a7 authored by Thomas Edwards

demo

parent df73f011
@@ -8,6 +8,7 @@
from func.collocation.collocation import *
from func.concordance.concordance import *
from func.mutlidatasets.multidatasets import *
from func.neroverall.neroverall import *
from func.usasFine.usasFine import *

# Perform NER on a file
# TAKES XML text page

@@ -44,6 +45,14 @@ def get_usas_for_data(page):
    return make_response(jsonify(result), 400)


def get_usasFine_for_data(page):
    result = run_usasFine_on_text(page)
    if result["code"] == "SUCCESS":
        return make_response(jsonify(result), 201)
    return make_response(jsonify(result), 400)


# Perform Sentiment analysis on a file
# TAKES XML text page
...
import spacy
import math
from shared.translate import translate
import nltk
from nltk.collocations import TrigramCollocationFinder
from nltk.metrics import TrigramAssocMeasures
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from db.db_config import get_db


def run_collocation_on_text(page):
    collocations = []
    nlp = spacy.load('zh_core_web_sm')
    conn, cursor = get_db()
    cursor.execute('SELECT * from news;')
    res = cursor.fetchall()
    data = []
    for row in res:
        docid = row[0]
        content = row[-1].replace('\n', ' ').replace('\t', ' ')
        data.append([docid, content])

    corpus = []
    for i in range(0, len(data)):
        id = data[i][0]
        txt = data[i][1]
        doc = nlp(txt)
        for token in doc:
            if not token.is_stop:
                # print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop)
                corpus.append(token.text.lower())

    biagram_collocation = BigramCollocationFinder.from_words(corpus)
    #biagram_collocation.apply_freq_filter(3)
    #trigram_collocation = TrigramCollocationFinder.from_words(corpus)
    #trigram_collocation.apply_freq_filter(3)
    scoredbigrams = biagram_collocation.score_ngrams(BigramAssocMeasures().likelihood_ratio)
    bigramterms = []
    for j in scoredbigrams:
        jstr = " ".join(b for b in j[0])
        bigramterms.append(jstr)
    #scoretrigrams = trigram_collocation.score_ngrams(TrigramAssocMeasures().likelihood_ratio)
    #allscores = scoredbigrams+scoretrigrams
    for item in scoredbigrams:
        itemstr = " ".join(i for i in item[0])
        if '' in itemstr:
            itemstrnew = itemstr.replace('','').strip().replace(' ','')
            translation = translate(itemstr.replace('','').strip()).text.lower()
            #print(translation)
            #print('--------------')
            score = round(item[1], 3)
            freq = bigramterms.count(itemstr) / 1000
            collocations.append({"0 Collocate": itemstrnew, "1 LogRatio": score, "2 Frequency": freq})
    collocationsorted = sorted(collocations, key=lambda x: (x["1 LogRatio"], x["2 Frequency"]), reverse=True)[:10]
    result = {'output': collocationsorted, 'message': 'Done', 'code': 'SUCCESS'}
    return result
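For reference, a minimal sketch of the NLTK bigram scoring used above, run on a toy token list instead of the spaCy-tokenised corpus; the token values are illustrative only.

from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

tokens = ['研发', '投入', '研发', '投入', '企业', '创新']  # hypothetical tokens
finder = BigramCollocationFinder.from_words(tokens)
# score_ngrams returns [((w1, w2), score), ...] sorted by score, highest first
scored = finder.score_ngrams(BigramAssocMeasures().likelihood_ratio)
for (w1, w2), score in scored[:3]:
    print(w1, w2, round(score, 3))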
import spacy
from shared.translate import translate
from wasabi import Printer
from spacy.matcher import PhraseMatcher
from db.db_config import get_db
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures


def collocations():
    collocations = []
    nlp = spacy.load('zh_core_web_sm')
    conn, cursor = get_db()
    cursor.execute('SELECT * from news;')
    res = cursor.fetchall()
    data = []
    for row in res:
        docid = row[0]
        content = row[-1].replace('\n', ' ').replace('\t', ' ')
        data.append([docid, content])

    corpus = []
    for i in range(0, len(data)):
        txt = data[i][1]
        doc = nlp(txt)
        for token in doc:
            if not token.is_stop:
                # print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop)
                corpus.append(token.text.lower())

    biagram_collocation = BigramCollocationFinder.from_words(corpus)
    scoredbigrams = biagram_collocation.score_ngrams(BigramAssocMeasures().likelihood_ratio)
    bigramterms = []
    for j in scoredbigrams:
        jstr = " ".join(b for b in j[0])
        bigramterms.append(jstr)
    # scoretrigrams = trigram_collocation.score_ngrams(TrigramAssocMeasures().likelihood_ratio)
    # allscores = scoredbigrams+scoretrigrams
    for item in scoredbigrams:
        itemstr = " ".join(i for i in item[0])
        if '' in itemstr:
            itemstrnew = itemstr
            translation = translate(itemstr).text.lower()
            # print(translation)
            # print('--------------')
            score = round(item[1], 3)
            freq = bigramterms.count(itemstr) / 1000
            collocations.append({"0 Collocate": itemstrnew, "1 LogRatio": score, "2 Frequency": freq})
    collocationsorted = sorted(collocations, key=lambda x: (x["1 LogRatio"], x["2 Frequency"]), reverse=True)[:10]
    terms = [d['0 Collocate'] for d in collocationsorted]
    return terms


def run_concordance_on_text(page):
    nlp = spacy.load('zh_core_web_sm')
    conn, cursor = get_db()
    cursor.execute('SELECT * from news;')
    res = cursor.fetchall()
    data = []
    for row in res:
        docid = row[0]
        content = row[-1].replace('\n', ' ').replace('\t', ' ')
        data.append([docid, content])

    concordances = []
    terms = collocations()
    for i in range(0, len(data)):
        id = data[i][0]
        txt = data[i][1]
        doc = nlp(txt)

        matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
        patterns = [nlp.make_doc(term) for term in terms]
        matcher.add("TermCollocations", patterns)

        matches = matcher(doc)
        match = Printer()
        for j, start, end in matches:
            perecedingSlice = doc[start - 20: start].text
            if '' in perecedingSlice:
                perecedingSlice = perecedingSlice.split('')[1]
            else:
                perecedingSlice = perecedingSlice.strip()

            #perecedingSliceTr = clean(translate(doc[start - 20: start]).text)
            matchedTerm = doc[start:end].text
            #matchedTerm = doc[start:end].text
            matchedTermTr = match.text(translate(doc[start:end].text).text, color='red', no_print=True)
            #matchedTermTr = match.text(translate(doc[start:end].text).text)
            followingSlice = doc[end:end + 20].text
            #followingSliceTr = clean(translate(doc[end:end + 20]).text)

            #context = perecedingSlice+', '+matchedTerm+', '+followingSlice
            #contextTr = perecedingSliceTr+', '+matchedTermTr+', '+followingSliceTr
            #concordances.append({"0 Term": escapeAnscii(matchedTerm), "1 Eng": escapeAnscii(matchedTermTr), "2 Context":escapeAnscii(context), "3 Context Eng":escapeAnscii(contextTr)})
            concordances.append({"0 Preceded By": perecedingSlice, "1 Term": matchedTerm, "2 Followed By": followingSlice})

    result = {'output': concordances, 'message': 'Done', 'code': 'SUCCESS'}
    return result

#def main():
#    result = run_concordance_on_text(page)
#main()
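A minimal sketch of the spaCy PhraseMatcher pattern used in run_concordance_on_text. It uses a blank English pipeline and an English phrase purely as a stand-in so it runs without the zh_core_web_sm model; the real code matches collocates returned by collocations() against each news document.

import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank('en')  # stand-in pipeline; the real code loads zh_core_web_sm
matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
matcher.add("TermCollocations", [nlp.make_doc("research funding")])  # hypothetical collocate

doc = nlp("The firm increased its Research Funding again this year")
for match_id, start, end in matcher(doc):
    # a 20-token window on each side gives the concordance context
    print(doc[max(start - 20, 0):start].text, '|', doc[start:end].text, '|', doc[end:end + 20].text)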
@@ -12,7 +12,7 @@ def run_multidatasets():
    data = []
    for row in res:
        print(row)
        data.append({"0 Title": row[1], "1 Date": row[4]})
...
@@ -2,42 +2,54 @@ import torch
from ckip_transformers.nlp import CkipWordSegmenter, CkipPosTagger, CkipNerChunker
from transformers import pipeline
import pandas as pd
from db.db_config import get_db
#from shared.translate import translate

#page = '尼罗河 是一条流經非洲東部與北部的河流,與中非地區的剛果河、非洲南部的赞比西河以及西非地区的尼日尔河並列非洲最大的四個河流系統。'

# Perform NER on Text
def run_ner_on_text(page):
    ner_driver = CkipNerChunker(model="bert-base")
    conn, cursor = get_db()
    cursor.execute('SELECT * from news;')
    res = cursor.fetchall()
    data = []
    for row in res:
        docid = row[0]
        content = row[-1].replace('\n', ' ').replace('\t', ' ')
        data.append([docid, content])

    ner_words_with_count = []
    for i in range(0, len(data)):
        id = data[i][0]
        txt = data[i][1]
        ner = ner_driver([txt])
        tags = []
        for item in ner[0]:
            word = item.word
            ner = item.ner
            tags.append(word + '__' + ner)

        seen_words = []
        for tag in tags:
            if tag not in seen_words:
                freq = tags.count(tag) / 1000
                word = tag.split('__')[0]
                ner = tag.split('__')[1]
                #translation = translate(word).text
                if ner == 'PERSON':
                    ner_words_with_count.append({"0 Word": word, "1 NER": ner, "2 Frequency": freq})
                seen_words.append(tag)

    perdoc = sorted(ner_words_with_count, key=lambda x: x["2 Frequency"], reverse=True)[:30]
    result = {'output': perdoc, 'message': 'Done', 'code': 'SUCCESS'}
    return result
@@ -47,3 +59,7 @@ def run_ner_on_text(page):
...
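The per-document counting in run_ner_on_text can be checked without loading the CKIP model; the (word, label) pairs below are hypothetical stand-ins for CkipNerChunker output, and the loop mirrors the dedup, PERSON filter, and 1/1000 scaling used above.

# Hypothetical stand-in for CkipNerChunker output on one document
doc_entities = [('吴景晖', 'PERSON'), ('宁波', 'GPE'), ('吴景晖', 'PERSON')]

tags = [word + '__' + label for word, label in doc_entities]
seen, rows = [], []
for tag in tags:
    if tag not in seen:
        word, label = tag.split('__')
        if label == 'PERSON':
            # raw count scaled by 1/1000, as in the frequency convention above
            rows.append({"0 Word": word, "1 NER": label, "2 Frequency": tags.count(tag) / 1000})
        seen.append(tag)

print(sorted(rows, key=lambda r: r["2 Frequency"], reverse=True))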
@@ -16,39 +16,42 @@ def run_neroverall_on_text(page):
        data.append([docid, content])

    ner_with_count = []
    tags = []
    ners = []
    seen_words = []
    seen_tags = []
    for i in range(0, len(data)):
        id = data[i][0]
        txt = data[i][1]
        ner = ner_driver([txt])
        for item in ner[0]:
            word = item.word
            ner = item.ner
            tags.append(word + '__' + ner)

    for tag in tags:
        if tag not in seen_words:
            ner = tag.split('__')[1].strip()
            ners.append(ner)
            seen_words.append(tag)

    for n in ners:
        if n not in seen_tags:
            freq = ners.count(n) / 1000
            ner_with_count.append({"1 NER": n, "2 Frequency": freq})
            seen_tags.append(n)

    nerall = sorted(ner_with_count, key=lambda x: x["2 Frequency"], reverse=True)
    result = {'output': nerall, 'message': 'Done', 'code': 'SUCCESS'}
    return result
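The final counting-and-sorting step over the collected labels could also be written with collections.Counter; a sketch with hypothetical labels and the same 1/1000 scaling, shown only to clarify that step.

from collections import Counter

# Hypothetical NER labels collected across all documents
ners = ['PERSON', 'GPE', 'PERSON', 'DATE', 'PERSON']
counts = Counter(ners)
nerall = sorted(
    ({"1 NER": n, "2 Frequency": c / 1000} for n, c in counts.items()),
    key=lambda x: x["2 Frequency"], reverse=True,
)
print(nerall)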
@@ -8,30 +8,53 @@ from transformers import (
from transformers import pipeline
import re
from db.db_config import get_db


def zng(paragraph):
    for sent in re.findall(u'[^!?。\.\!\?]+[!?。\.\!\?]?', paragraph, flags=re.U):
        yield sent


def run_sentiment_on_text(page):
    conn, cursor = get_db()
    cursor.execute('SELECT * from news;')
    res = cursor.fetchall()
    data = []
    sentiments = []
    for row in res:
        docid = row[0]
        content = row[-1].replace('\n', ' ').replace('\t', ' ')
        data.append([docid, content])

    allsentences = []
    for i in range(0, len(data)):
        txt = data[i][1]
        pagesList = list(zng(txt))
        allsentences.append(pagesList)
    allsentences = [x for xs in allsentences for x in xs]

    tokenizer = BertTokenizerFast.from_pretrained('bardsai/finance-sentiment-zh-base')
    model = AutoModelForSequenceClassification.from_pretrained('bardsai/finance-sentiment-zh-base')
    nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
    counts = dict()
    for p in allsentences:
        res = nlp(p)[0]['label']
        counts[res] = counts.get(res, 0) + 1
    if 'negative' not in counts.keys():
        counts['negative'] = 0
    for k in counts.keys():
        sentiments.append({"0 Sentiment": k, "1 Count": counts[k]})
    sentiments = sorted(sentiments, key=lambda x: x["1 Count"], reverse=True)
    result = {'output': sentiments, 'message': 'Done', 'code': 'SUCCESS'}
    return result
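The zng helper splits Chinese text into sentences by matching runs of non-terminator characters plus an optional terminator; a small standalone check of that regex, using sample text only.

import re

def zng(paragraph):
    # split on the sentence-ending punctuation in the character class, keeping the delimiter
    for sent in re.findall(u'[^!?。\.\!\?]+[!?。\.\!\?]?', paragraph, flags=re.U):
        yield sent

text = '创新是这类企业的灵魂。足够的研发投入是重要保障。'
print(list(zng(text)))
# expected: ['创新是这类企业的灵魂。', '足够的研发投入是重要保障。']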
import html_to_json
from shared.translate import translate


# Perform translation on text
def run_translation_on_text(page):
    try:
        output_json = html_to_json.convert_tables(page)
        output = output_json[0]
        translated = []

        for item in output:
            try:
                translated_item = {}  # Use a proper dictionary to hold key-value pairs
                for k, v in item.items():
                    # Translate text and store in dictionary
                    translated_item[k] = translate(v).text
                #time.sleep(1)  # Throttle API calls to avoid exhausting memory and network resources
                translated.append(translated_item)
            except Exception as e:
                # Log or handle translation errors and continue processing other items
                print(f"Error translating item {item}: {e}")

        result = {'output': translated, 'message': 'Done', 'code': 'SUCCESS'}
        return result
    except Exception as e:
        print(f"Error in run_translation_on_text: {e}")
        return {'output': None, 'message': f'Error: {e}', 'code': 'FAILED'}
\ No newline at end of file
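The per-row loop in run_translation_on_text is plain dict iteration once the HTML table has been parsed. A sketch under the assumption that each parsed row behaves like a dict; translate_stub is a hypothetical placeholder for shared.translate.translate, so this runs without the translation service.

# Stand-in for one parsed table: a list of row dicts (assumed shape)
rows = [{'公司': '宁波创润新材料有限公司', '投入': '2500万元'}]

def translate_stub(text):
    # placeholder for shared.translate.translate(text).text
    return f'<translated:{text}>'

translated = []
for item in rows:
    try:
        translated.append({k: translate_stub(v) for k, v in item.items()})
    except Exception as e:
        # skip rows that fail and keep going, as in run_translation_on_text
        print(f"Error translating item {item}: {e}")

print(translated)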
import spacy
from db.db_config import get_db

# Perform USAS on Text
@@ -7,7 +8,7 @@ import spacy
def run_usas_on_text(page):
    d = {}
    with open('/Users/tom/PycharmProjects/cognistance/func/usas/usas_overall.txt') as f:
        for line in f:
            lineL = line.replace('\n', '').split(' ', 1)
            key = lineL[0].strip()
@@ -22,35 +23,60 @@ def run_usas_on_text(page):
    # Adds the Chinese PyMUSAS rule-based tagger to the main spaCy pipeline
    nlp.add_pipe('pymusas_rule_based_tagger', source=chinese_tagger_pipeline)

    conn, cursor = get_db()
    cursor.execute('SELECT * from news;')
    res = cursor.fetchall()
    data = []
    for row in res:
        docid = row[0]
        content = row[-1].replace('\n', ' ').replace('\t', ' ')
        data.append([docid, content])

    #output_doc = nlp(page)
    usas_tags_with_count = []
    tags = []
    seen_tags = []
    for i in range(0, len(data)):
        id = data[i][0]
        txt = data[i][1]
        output_doc = nlp(txt)
        for token in output_doc:
            start, end = token._.pymusas_mwe_indexes[0]
            idx = (start, end)
            for el in token._.pymusas_tags:
                el = el.split('.')[0][0]
                #obj = {"word": token.text, "USAS Tags": el, "idx": idx}
                tags.append(el)
                #data.append(obj)

    for tag in tags:
        if tag not in seen_tags:
            try:
                freq = tags.count(tag)/1000
                usas_tags_with_count.append({"0 Tag": tag, "1 Definition": d[tag], "2 Frequency": freq})
            except KeyError:
                pass
            seen_tags.append(tag)

    usas_tags_with_count_sorted = sorted(usas_tags_with_count, key=lambda x: x["2 Frequency"], reverse=True)
    result = {'output': usas_tags_with_count_sorted, 'message': 'Done', 'code': 'SUCCESS'}
    return result
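run_usas_on_text now keeps only the leading letter of each PyMUSAS tag before looking it up in usas_overall.txt; a one-line check of the split('.')[0][0] convention on sample tag strings (the tag values are illustrative).

# Example PyMUSAS-style tag strings (illustrative)
raw_tags = ['A1.1.1', 'S7.1+', 'Z99']
top_level = [t.split('.')[0][0] for t in raw_tags]
print(top_level)  # expected: ['A', 'S', 'Z']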
import spacy
from db.db_config import get_db


# Perform USAS on Text
#page = '尼罗河 是一条流經非洲東部與北部的河流,與中非地區的剛果河、非洲南部的赞比西河以及西非地区的尼日尔河並列非洲最大的四個河流系統。'
def run_usasFine_on_text(page):
    d = {}
    with open('/Users/tom/PycharmProjects/cognistance/func/usas/usas_desc.txt') as f:
        for line in f:
            lineL = line.replace('\n', '').split(' ', 1)
            key = lineL[0].strip()
            val = lineL[1].strip()
            d[key] = val
    print(d)
    # We exclude the following components as we do not need them.
    nlp = spacy.load('zh_core_web_sm', exclude=['parser', 'ner'])
    # Load the Chinese PyMUSAS rule-based tagger in a separate spaCy pipeline
    chinese_tagger_pipeline = spacy.load('cmn_dual_upos2usas_contextual')
    # Adds the Chinese PyMUSAS rule-based tagger to the main spaCy pipeline
    nlp.add_pipe('pymusas_rule_based_tagger', source=chinese_tagger_pipeline)

    conn, cursor = get_db()
    cursor.execute('SELECT * from news;')
    res = cursor.fetchall()
    data = []
    for row in res:
        docid = row[0]
        content = row[-1].replace('\n', ' ').replace('\t', ' ')
        data.append([docid, content])

    #output_doc = nlp(page)
    usas_tags_with_count = []
    tags = []
    seen_tags = []
    for i in range(0, len(data)):
        id = data[i][0]
        txt = data[i][1]
        output_doc = nlp(txt)
        for token in output_doc:
            start, end = token._.pymusas_mwe_indexes[0]
            idx = (start, end)
            for el in token._.pymusas_tags:
                el = el.split('.')[0]
                #obj = {"word": token.text, "USAS Tags": el, "idx": idx}
                #tags.append(el)
                word = token.text
                tags.append(word + '__' + el)
                #data.append(obj)

    for tag in tags:
        if tag not in seen_tags:
            try:
                freq = tags.count(tag)/1000
                word = tag.split('__')[0]
                usas = tag.split('__')[1]
                if 'A' in usas:
                    tag_object = {"0 Word": word, "1 Discourse Field": usas, "2 Definition": d[usas], "3 Frequency": freq}
                    usas_tags_with_count.append(tag_object)
            except KeyError:
                pass
            seen_tags.append(tag)

    usas_tags_with_count_sorted = sorted(usas_tags_with_count, key=lambda x: x["3 Frequency"], reverse=False)[:30]
    result = {'output': usas_tags_with_count_sorted, 'message': 'Done', 'code': 'SUCCESS'}
    return result
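run_usasFine_on_text builds its definition lookup with split(' ', 1), which implies usas_desc.txt holds one tag code per line followed by its description after the first space. A sketch of that parsing on inline sample lines; the codes and descriptions here are illustrative stand-ins, not the real file contents.

# Illustrative stand-in for the usas_desc.txt format implied by split(' ', 1)
sample = [
    'A1.1.1 General actions / making',
    'A2.1 Affect: modify, change',
]
d = {}
for line in sample:
    key, val = line.replace('\n', '').split(' ', 1)
    d[key.strip()] = val.strip()
print(d['A2.1'])  # expected: 'Affect: modify, change'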
@@ -45,6 +45,14 @@ def usas():
    return result


@app.route("/usasFine", methods=['POST'])
def usasFine():
    request_data = request.get_json()
    page = request_data['page']
    result = get_usasFine_for_data(page)
    return result
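Assuming the Flask app runs on its default localhost:5000, the new endpoint can be exercised like the existing ones. The host, port, and payload value here are assumptions; only the /usasFine route and the 'page' key come from the code above.

import requests

# hypothetical local call; adjust host/port to wherever the app is served
resp = requests.post('http://localhost:5000/usasFine', json={'page': '<p>...</p>'})
print(resp.status_code, resp.json())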

@app.route("/sentiment", methods=['POST'])
def sentiment():
...