diff --git a/api/api_functions.py b/api/api_functions.py
index 2b83b6da513fb4a19dec90c8347fb2b0043e1678..61afca16a24551a5e931452adc6fc3c34ddf6b26 100644
--- a/api/api_functions.py
+++ b/api/api_functions.py
@@ -6,7 +6,8 @@ from func.translation.translation import run_translation_on_text
 from func.usas.usas import *
 from func.collocation.collocation import *
 from func.concordance.concordance import *
-
+from func.mutlidatasets.multidatasets import *
+from func.neroverall.neroverall import *
 
 # Perform NER on a file
 # TAKES XML text page
@@ -71,3 +72,20 @@ def get_concordance_for_data(page):
 
     return make_response(jsonify(result), 400)
 
+def run_multidatasets_all():
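+    # Fetch every stored article via run_multidatasets() and wrap it in a Flask response.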
+    result = run_multidatasets()
+
+    if result["code"] == "SUCCESS":
+        return make_response(jsonify(result), 201)
+
+    return make_response(jsonify(result), 400)
+
+def run_neroverall_all(page):
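+    # Run the corpus-wide NER frequency analysis and wrap it in a Flask response.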
+    result = run_neroverall_on_text(page)
+
+    if result["code"] == "SUCCESS":
+        return make_response(jsonify(result), 201)
+
+    return make_response(jsonify(result), 400)
+
+
diff --git a/db/db_config.py b/db/db_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..f73a4d252074c4f120c1fe89ef7724142861ab3c
--- /dev/null
+++ b/db/db_config.py
@@ -0,0 +1,8 @@
+import os
+import sqlite3
+
+
+def get_db():
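+    # Open the project's SQLite database; callers get the connection and a cursor and should close them when done.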
+    # NOTE: this assumes chinesedata.db lives alongside this file; adjust the path if it does not.
+    db_path = os.path.join(os.path.dirname(__file__), 'chinesedata.db')
+    conn = sqlite3.connect(db_path)
+    cursor = conn.cursor()
+    return conn, cursor
\ No newline at end of file
diff --git a/db/db_creation.py b/db/db_creation.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6f22a348808ea05633e2945c455e56faf7dac2c
--- /dev/null
+++ b/db/db_creation.py
@@ -0,0 +1,30 @@
+import sqlite3
+
+# Connect to an SQLite database (or create it if it doesn't exist)
+conn = sqlite3.connect('chinesedata.db')
+
+# Create a cursor object using the cursor() method
+cursor = conn.cursor()
+
+# Create table
+cursor.execute('''CREATE TABLE IF NOT EXISTS news
+             (id integer primary key, title text, location text, pubyear text, pubdate text, websource text, loaddate text, content longtext)''')
+
+
+# Select and print every row from the news table
+statement = '''SELECT * FROM news'''
+
+cursor.execute(statement)
+print("All the data")
+output = cursor.fetchall()
+for row in output:
+    print(row)
+# Save (commit) the changes
+conn.commit()
+
+# Close the connection
+conn.close()
\ No newline at end of file
diff --git a/func/collocation/collocation.py b/func/collocation/collocation.py
index 85db37eb60b4dbec08d062d9db9d5f883b0c69a6..64396ea4a3cb3a6bd0aff58d359a01df25334b42 100644
--- a/func/collocation/collocation.py
+++ b/func/collocation/collocation.py
@@ -1,64 +1,65 @@
 import spacy
 import math
-from collections import Counter, defaultdict
 from shared.translate import translate
+from nltk.collocations import TrigramCollocationFinder
+from nltk.metrics import TrigramAssocMeasures
+from nltk.collocations import BigramCollocationFinder
+from nltk.metrics import BigramAssocMeasures
+
 
 #page = '专精特新”企业,是指具有专业化、精细化、特色化、新颖化四大特征的中小企业。创新是这类企业的灵魂,足够的 研发费用投入则是开展创新的重要保障。许多尚处在成长期的“专精特新”企业,近期普遍遭遇“钱紧”难题。如何集聚 更多的资金投入研发、保持创新领先地位是这些企业近来面临的最大烦恼。“作为一家新材料研发公司,创新是我们发展的重要驱动力,只有研发投入的不断加码,企业创新发展的步伐才不会 降速。”浙江省“专精特新”企业、宁波创润新材料有限公司董事长吴景晖说,过去3年,企业在研发投入方面不遗余力,累计投入2500万元,这对企业来说不是个小数目。 今年新兴市场的研发需求十分迫切,我们一直想加快 超高纯钛及钛合金中试生产线项目 的研发进度,但苦于资金不 足。令人高兴的是,今年4月340万元存量增值税留抵税额的到账,有效缓解了企业的资金压力,加快了企业的研发 进度。”吴景晖说,目前,“超高纯钛及钛合金中试生产线项目”正在有序推进,一旦投产将缓解半导体产业的高纯钛原材料供应不足问题,提升国产溅射靶材的市场竞争力'
 
-# Step 4: Calculate PMI for all Bigrams
-def calculate_pmi(bigram, p_bigram, p_word):
-    word1, word2 = bigram
-    return math.log2(max(p_bigram[bigram], 1e-10) / (max(p_word[word1], 1e-10) * max(p_word[word2], 1e-10)))
-
-def escape(token: str):
-    token = token.replace("&", " ")
-    token = token.replace("-", " ")
-    token = token.replace("<", " ")
-    token = token.replace(">", " ")
-    token = token.replace("\"", " ")
-    token = token.replace("'", " ")
-    token = token.strip()
-    return token
+
+
+
+def clean(text):
+
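+    # Strip leftover HTML tags and whitespace artefacts from the scraped article text.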
+    text = text.replace('<p>', ' ')
+    text = text.replace('</p>', ' ')
+    text = text.replace('<br>', ' ')
+    text = text.replace('</br>', ' ')
+    text = text.replace('><', ' ')
+    text = text.replace('\u3000', ' ')
+    text = text.replace('br', ' ')
+    cltext = text.replace('\n', ' ').strip()
+    return str(cltext)
 
 def run_collocation_on_text(page):
+
+    page = clean(page)
+
     corpus = []
+    collocations = []
 
     nlp = spacy.load('zh_core_web_sm')
     doc = nlp(page)
+
     for token in doc:
         if not token.is_stop:
+            # print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,token.shape_, token.is_alpha, token.is_stop)
+            corpus.append(token.text.lower())
 
-            #print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,token.shape_, token.is_alpha, token.is_stop)
-            corpus.append(escape(token.text.lower()))
-
-    # Step 2: Calculate Frequencies
-    word_freq = Counter(corpus)
-    bigram_freq = Counter(zip(corpus[:-1], corpus[1:]))
 
-    # Step 3: Calculate Probabilities
-    total_words = len(corpus)
-    p_word = defaultdict(float)
-    p_bigram = defaultdict(float)
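+    # Collect candidate bigrams and trigrams and score them with NLTK's log-likelihood ratio.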
+    bigram_collocation = BigramCollocationFinder.from_words(corpus)
+    #bigram_collocation.apply_freq_filter(3)
+    trigram_collocation = TrigramCollocationFinder.from_words(corpus)
+    #trigram_collocation.apply_freq_filter(3)
 
-    for word, freq in word_freq.items():
-        p_word[word] = freq / total_words
+    scoredbigrams = bigram_collocation.score_ngrams(BigramAssocMeasures().likelihood_ratio)[:10]
 
-    total_bigrams = len(corpus) - 1
-    all_pmi_scores = []
-    for bigram, freq in bigram_freq.items():
-        p_bigram[bigram] = freq / total_bigrams
-        bigramstr = bigram[0]+' '+bigram[1]
-        translation = translate(bigramstr).text.lower()
-        pmi = calculate_pmi(bigram, p_bigram, p_word)
+    scoretrigrams = trigram_collocation.score_ngrams(TrigramAssocMeasures().likelihood_ratio)[:10]
+    allscores = scoredbigrams+scoretrigrams
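+    # Translate each collocation and scale the raw likelihood-ratio score down for display.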
+    for item in allscores:
+        itemstr = " ".join(i for i in item[0])
+        translation = translate(itemstr).text.lower()
+        score = item[1]/1000000
+        collocations.append({"0 Term": itemstr, "1 Translation": translation, "2 LogRatio": score})
 
-        all_pmi_scores.append({"0 Term": bigramstr,"1 Translation":translation ,"2 PMI Score": round(pmi,3)})
 
+    collocations = sorted(collocations, key=lambda x: x["2 LogRatio"], reverse=True)
 
 
-    all_pmi_score = sorted(all_pmi_scores, key=lambda x: x["2 PMI Score"], reverse=True)
-    all_pmi_score = all_pmi_score[slice(40)]
-    result = {'output': all_pmi_score, 'message': 'Done', 'code': 'SUCCESS'}
-    print("PMI Scores:", all_pmi_scores)
-
+    result = {'output': collocations, 'message': 'Done', 'code': 'SUCCESS'}
     return result
 
+
diff --git a/func/concordance/concordance.py b/func/concordance/concordance.py
index 966a1a6d589183ca7d94aa6071afc9c63bd5d351..c716d2008a54781645c4a176a35a8d7f33a10c18 100644
--- a/func/concordance/concordance.py
+++ b/func/concordance/concordance.py
@@ -5,77 +5,67 @@ from shared.translate import translate
 from wasabi import Printer
 from spacy.matcher import PhraseMatcher
 import re
+from nltk.collocations import TrigramCollocationFinder
+from nltk.metrics import TrigramAssocMeasures
+from nltk.collocations import BigramCollocationFinder
+from nltk.metrics import BigramAssocMeasures
 
 #page = '专精特新”企业,是指具有专业化、精细化、特色化、新颖化四大特征的中小企业。创新是这类企业的灵魂,足够的 研发费用投入则是开展创新的重要保障。许多尚处在成长期的“专精特新”企业,近期普遍遭遇“钱紧”难题。如何集聚 更多的资金投入研发、保持创新领先地位是这些企业近来面临的最大烦恼。“作为一家新材料研发公司,创新是我们发展的重要驱动力,只有研发投入的不断加码,企业创新发展的步伐才不会 降速。”浙江省“专精特新”企业、宁波创润新材料有限公司董事长吴景晖说,过去3年,企业在研发投入方面不遗余力,累计投入2500万元,这对企业来说不是个小数目。 今年新兴市场的研发需求十分迫切,我们一直想加快 超高纯钛及钛合金中试生产线项目 的研发进度,但苦于资金不 足。令人高兴的是,今年4月340万元存量增值税留抵税额的到账,有效缓解了企业的资金压力,加快了企业的研发 进度。”吴景晖说,目前,“超高纯钛及钛合金中试生产线项目”正在有序推进,一旦投产将缓解半导体产业的高纯钛原材料供应不足问题,提升国产溅射靶材的市场竞争力'
 
-def calculate_pmi(bigram, p_bigram, p_word):
-    word1, word2 = bigram
-    return math.log2(max(p_bigram[bigram], 1e-10) / (max(p_word[word1], 1e-10) * max(p_word[word2], 1e-10)))
 
-def escape(token: str):
-    token = token.replace("&", " ")
-    token = token.replace("-", " ")
-    token = token.replace("<", " ")
-    token = token.replace(">", " ")
-    token = token.replace("\"", " ")
-    token = token.replace("'", " ")
-    token = token.strip()
-    return token
+def clean(text):
 
-def collocations(doc):
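+    # Strip leftover HTML tags, ANSI colour codes and whitespace artefacts from the text.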
+    text = text.replace('<p>', ' ')
+    text = text.replace('</p>', ' ')
+    text = text.replace('<br>', ' ')
+    text = text.replace('</br>', ' ')
+    text = text.replace('><', ' ')
+    text = text.replace('\u3000', ' ')
+    text = text.replace('br', ' ')
+    text = text.replace('——', '')
+    text = text.replace('[38;5;1m', '')
+    text = text.replace('[0m','')
+    cltext = text.replace('\n', ' ').strip()
+    return str(cltext)
 
+def collocations(doc):
     corpus = []
+    collocations = []
 
     for token in doc:
-        if not token.is_stop:
-            corpus.append(escape(token.text.lower()))
 
-    # Step 2: Calculate Frequencies
-    word_freq = Counter(corpus)
-    bigram_freq = Counter(zip(corpus[:-1], corpus[1:]))
+        if not token.is_stop:
+            # print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,token.shape_, token.is_alpha, token.is_stop)
+            corpus.append(token.text.lower())
 
-    # Step 3: Calculate Probabilities
-    total_words = len(corpus)
-    p_word = defaultdict(float)
-    p_bigram = defaultdict(float)
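+    # Score bigrams and trigrams with NLTK's log-likelihood ratio, keeping the top ten of each.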
+    bigram_collocation = BigramCollocationFinder.from_words(corpus)
+    # bigram_collocation.apply_freq_filter(3)
+    trigram_collocation = TrigramCollocationFinder.from_words(corpus)
+    # trigram_collocation.apply_freq_filter(3)
 
-    for word, freq in word_freq.items():
-        p_word[word] = freq / total_words
+    scoredbigrams = bigram_collocation.score_ngrams(BigramAssocMeasures().likelihood_ratio)[:10]
 
-    total_bigrams = len(corpus) - 1
-    all_pmi_scores = []
-    for bigram, freq in bigram_freq.items():
-        p_bigram[bigram] = freq / total_bigrams
-        bigramstr = bigram[0] + ' ' + bigram[1]
+    scoretrigrams = trigram_collocation.score_ngrams(TrigramAssocMeasures().likelihood_ratio)[:10]
+    allscores = scoredbigrams + scoretrigrams
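+    # Translate each collocation and scale its score; only the matched terms are used downstream.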
+    for item in allscores:
+        itemstr = " ".join(i for i in item[0])
+        translation = translate(itemstr).text.lower()
+        score = item[1] / 1000000
+        collocations.append({"0 Term": itemstr, "1 Translation": translation, "2 LogRatio": score})
 
-        pmi = calculate_pmi(bigram, p_bigram, p_word)
+    collocations = sorted(collocations, key=lambda x: x["2 LogRatio"], reverse=True)
 
-        all_pmi_scores.append({"0 Term": bigramstr, "2 PMI Score": round(pmi, 3)})
 
-    all_pmi_score = sorted(all_pmi_scores, key=lambda x: x["2 PMI Score"], reverse=True)
-    all_pmi_score = all_pmi_score[slice(40)]
-    terms = [item.get('0 Term') for item in all_pmi_score]
+    terms = [item.get('0 Term') for item in collocations]
 
 
     return  terms
 
-def clean(text):
-    ansi_escape = re.compile(r'''
-        \x1B  # ESC
-        (?:   # 7-bit C1 Fe (except CSI)
-            [@-Z\\-_]
-        |     # or [ for CSI, followed by a control sequence
-            \[
-            [0-?]*  # Parameter bytes
-            [ -/]*  # Intermediate bytes
-            [@-~]   # Final byte
-        )
-    ''', re.VERBOSE)
-    cltext = ansi_escape.sub('', text)
-
-    return str(cltext)
 
 def run_concordance_on_text(page):
+    page = clean(page)
     nlp = spacy.load('zh_core_web_sm')
     doc = nlp(page)
     terms = collocations(doc)
@@ -87,24 +77,29 @@ def run_concordance_on_text(page):
     matches = matcher(doc)
     match = Printer()
     for i, start, end in matches:
-        perecedingSlice = clean(doc[start - 7: start].text)
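+        # Build a keyword-in-context window of up to 20 tokens either side of the match.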
+        precedingSlice = doc[max(0, start - 20): start].text
+        if '。' in precedingSlice:
+            precedingSlice = precedingSlice.split('。')[-1]
+        else:
+            precedingSlice = precedingSlice.strip()
 
 
-        perecedingSliceTr = clean(translate(doc[start - 7: start]).text)
-        matchedTerm = clean(match.text(doc[start:end].text, color='red', no_print=True))
+        #perecedingSliceTr = clean(translate(doc[start - 20: start]).text)
+        matchedTerm = doc[start:end].text
         #matchedTerm = doc[start:end].text
-        matchedTermTr = clean(match.text(translate(doc[start:end].text).text, color='red', no_print=True))
         #matchedTermTr = match.text(translate(doc[start:end].text).text)
-        followingSlice = clean(doc[end:end + 7].text)
-        followingSliceTr = clean(translate(doc[end:end + 7]).text)
+        followingSlice = doc[end:end + 20].text
+        #followingSliceTr = clean(translate(doc[end:end + 20]).text)
 
-        context = perecedingSlice+', '+matchedTerm+', '+followingSlice
+        #context = perecedingSlice+', '+matchedTerm+', '+followingSlice
 
-        contextTr = perecedingSliceTr+', '+matchedTermTr+', '+followingSliceTr
+        #contextTr = perecedingSliceTr+', '+matchedTermTr+', '+followingSliceTr
         #concordances.append({"0 Term": escapeAnscii(matchedTerm), "1 Eng": escapeAnscii(matchedTermTr), "2 Context":escapeAnscii(context), "3 Context Eng":escapeAnscii(contextTr)})
-        concordances.append({"0 Term": matchedTerm, "1 Eng": matchedTermTr,"2 Context": context, "3 Context Eng": contextTr})
+        concordances.append({"0 Preceded By": precedingSlice, "1 Term": matchedTerm, "2 Followed By": followingSlice})
 
-    print(concordances)
+
     result = {'output': concordances, 'message': 'Done', 'code': 'SUCCESS'}
 
     return result
diff --git a/func/mutlidatasets/chinesedata.db b/func/mutlidatasets/chinesedata.db
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/func/mutlidatasets/multidatasets.py b/func/mutlidatasets/multidatasets.py
new file mode 100644
index 0000000000000000000000000000000000000000..00c845c2cc82ac723a948c4343860e4827f684e6
--- /dev/null
+++ b/func/mutlidatasets/multidatasets.py
@@ -0,0 +1,23 @@
+import pandas as pd
+from db.db_config import get_db
+
+
+
+#page = '尼罗河 是一条流經非洲東部與北部的河流,與中非地區的剛果河、非洲南部的赞比西河以及西非地区的尼日尔河並列非洲最大的四個河流系統。'
+# Perform NER on Text
+def run_multidatasets():
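+    # Pull the id, title, publication date and content of every article in the news table.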
+    conn, cursor = get_db()
+    cursor.execute('SELECT * from news;')
+    res = cursor.fetchall()
+    data = []
+    for row in res:
+        data.append({"0 Id": row[0], "1 Title": row[1], "2 Date": row[4], "3 Content": row[-1]})
+
+    result = {'output': data,'message': 'Done', 'code': 'SUCCESS'}
+
+    return result
diff --git a/func/neroverall/neroverall.py b/func/neroverall/neroverall.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c22352fe55021840878b41643078e37669346b4
--- /dev/null
+++ b/func/neroverall/neroverall.py
@@ -0,0 +1,54 @@
+from ckip_transformers.nlp import CkipNerChunker
+from db.db_config import get_db
+
+
+#page = '尼罗河 是一条流經非洲東部與北部的河流,與中非地區的剛果河、非洲南部的赞比西河以及西非地区的尼日尔河並列非洲最大的四個河流系統。'
+# Perform NER on Text
+def run_neroverall_on_text(page):
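+    # Tag every document in the news table with CKIP NER and count each label once per document.
+    # Note: the 'page' argument is currently unused; all text comes from the database.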
+    ner_driver = CkipNerChunker(model="bert-base")
+    conn, cursor = get_db()
+    cursor.execute('SELECT * from news;')
+    res = cursor.fetchall()
+    data = []
+    for row in res:
+        docid = row[0]
+        content = row[-1].replace('\n', ' ').replace('\t', ' ')
+        data.append([docid, content])
+
+    ner_with_count = []
+    for docid, txt in data:
+        ner_results = ner_driver([txt])
+        tags = []
+        for item in ner_results[0]:
+            tags.append(item.word + '__' + item.ner)
+
+        ners = []
+
+        seen_words = []
+        seen_tags = []
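+        # Keep one NER label per distinct word__label pair in this document.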
+        for tag in tags:
+            if tag not in seen_words:
+                label = tag.split('__')[1].strip()
+                ners.append(label)
+            seen_words.append(tag)
+
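+        # Record each distinct label once, scaling its raw count down for the front end.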
+        for n in ners:
+            if n not in seen_tags:
+                freq = ners.count(n) / 1000000
+                ner_with_count.append({"0 Doc Id": docid, "1 NER": n, "2 Frequency": freq})
+
+            seen_tags.append(n)
+
+    #nerall = sorted(ner_with_count, key=lambda x: (x["0 Doc Id"],x["2 Frequency"]), reverse=True)
+
+    result = {'output': ner_with_count,'message': 'Done', 'code': 'SUCCESS'}
+
+    return result
+
+
diff --git a/func/usas/usas.py b/func/usas/usas.py
index 3a73dc60b65c18f781909c54617eb4fe05a45358..a5c6a190bd80701358148e0fac7fc3d96e73d454 100644
--- a/func/usas/usas.py
+++ b/func/usas/usas.py
@@ -14,6 +14,7 @@ def run_usas_on_text(page):
             val = lineL[1].strip()
             d[key] = val
 
+
     # We exclude the following components as we do not need them.
     nlp = spacy.load('zh_core_web_sm', exclude=['parser', 'ner'])
     # Load the Chinese PyMUSAS rule-based tagger in a separate spaCy pipeline
@@ -31,6 +32,7 @@ def run_usas_on_text(page):
         idx = (start, end)
 
         for el in token._.pymusas_tags:
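+            # Keep only the top-level USAS category (e.g. 'A1.1.1' becomes 'A1') by dropping everything after the first dot.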
+            el = el.split('.')[0]
             #obj = {"word": token.text, "USAS Tags": el, "idx": idx}
             tags.append(el)
             #data.append(obj)
@@ -51,3 +53,4 @@ def run_usas_on_text(page):
 
     return result
 
+
diff --git a/main.py b/main.py
index 5eec639603704766370978254d88bf5b2942a3a3..422a801e460f3881b3a2dd97c1f5f20773d5cbb3 100644
--- a/main.py
+++ b/main.py
@@ -71,3 +71,21 @@ def concordance():
     result = get_concordance_for_data(page)
 
     return result
+
+@app.route("/multidatasets", methods=['POST'])
+def multidatasets():
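+    # No request payload is needed; the datasets are read straight from the local database.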
+
+    result = run_multidatasets_all()
+
+    return result
+
+@app.route("/neroverall", methods=['POST'])
+def neroverall():
+
+    request_data = request.get_json()
+    page = request_data['page']
+    result = run_neroverall_all(page)
+
+    return result
\ No newline at end of file