Commit 490e54d9 authored by Thomas Edwards
Work towards collocation and concordance

parent ea282d9d
@@ -6,7 +6,8 @@ from func.translation.translation import run_translation_on_text
from func.usas.usas import *
from func.collocation.collocation import *
from func.concordance.concordance import *
from func.mutlidatasets.multidatasets import *
from func.neroverall.neroverall import *

# Perform NER on a file
# TAKES XML text page

@@ -71,3 +72,20 @@ def get_concordance_for_data(page):
    return make_response(jsonify(result), 400)
def run_multidatasets_all():
    result = run_multidatasets()
    if result["code"] == "SUCCESS":
        return make_response(jsonify(result), 201)
    return make_response(jsonify(result), 400)

def run_neroverall_all(page):
    result = run_neroverall_on_text(page)
    if result["code"] == "SUCCESS":
        return make_response(jsonify(result), 201)
    return make_response(jsonify(result), 400)
import sqlite3
def get_db():
    conn = sqlite3.connect('/Users/tom/PycharmProjects/cognistance/db/chinesedata.db')
    cursor = conn.cursor()
    return conn, cursor
\ No newline at end of file
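A minimal usage sketch for get_db (not part of the commit), assuming the hard-coded database path above resolves on the machine running the app; note that the caller is responsible for closing the connection:

# Usage sketch for get_db
from db.db_config import get_db

conn, cursor = get_db()
try:
    cursor.execute('SELECT COUNT(*) FROM news;')
    print(cursor.fetchone()[0])  # number of rows in the news table
finally:
    conn.close()  # get_db neither commits nor closes for the caller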
import sqlite3
# Connect to an SQLite database (or create it if it doesn't exist)
conn = sqlite3.connect('chinesedata.db')
# Create a cursor object using the cursor() method
cursor = conn.cursor()
# Create table
cursor.execute('''CREATE TABLE IF NOT EXISTS news
    (id integer primary key, title text, location text, pubyear text, pubdate text,
     websource text, loaddate text, content longtext)''')

# Insert a row of data
#cursor.execute("INSERT INTO stocks VALUES ('2006-01-05','BUY','RHAT',100,35.14)")

# To select all columns we use
statement = '''SELECT * FROM news'''
cursor.execute(statement)

print("All the data")
output = cursor.fetchall()
for row in output:
    print(row)
# Save (commit) the changes
conn.commit()
# Close the connection
conn.close()
\ No newline at end of file
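The commented-out INSERT above is left over from the sqlite3 docs' stocks example; a hypothetical standalone insert matching the news schema (not part of the commit) would use parameter binding so sqlite3 handles quoting:

# Hypothetical insert for the news schema; id is omitted so SQLite assigns the rowid
import sqlite3

conn = sqlite3.connect('chinesedata.db')
cursor = conn.cursor()
cursor.execute(
    "INSERT INTO news (title, location, pubyear, pubdate, websource, loaddate, content) "
    "VALUES (?, ?, ?, ?, ?, ?, ?)",
    ("Sample title", "Ningbo", "2023", "2023-04-01", "example.com", "2023-04-02", "sample content"),
)
conn.commit()
conn.close()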
import spacy
import math
from shared.translate import translate
from nltk.collocations import TrigramCollocationFinder
from nltk.metrics import TrigramAssocMeasures
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

#page = '专精特新”企业,是指具有专业化、精细化、特色化、新颖化四大特征的中小企业。创新是这类企业的灵魂,足够的 研发费用投入则是开展创新的重要保障。许多尚处在成长期的“专精特新”企业,近期普遍遭遇“钱紧”难题。如何集聚 更多的资金投入研发、保持创新领先地位是这些企业近来面临的最大烦恼。“作为一家新材料研发公司,创新是我们发展的重要驱动力,只有研发投入的不断加码,企业创新发展的步伐才不会 降速。”浙江省“专精特新”企业、宁波创润新材料有限公司董事长吴景晖说,过去3年,企业在研发投入方面不遗余力,累计投入2500万元,这对企业来说不是个小数目。 今年新兴市场的研发需求十分迫切,我们一直想加快 超高纯钛及钛合金中试生产线项目 的研发进度,但苦于资金不 足。令人高兴的是,今年4月340万元存量增值税留抵税额的到账,有效缓解了企业的资金压力,加快了企业的研发 进度。”吴景晖说,目前,“超高纯钛及钛合金中试生产线项目”正在有序推进,一旦投产将缓解半导体产业的高纯钛原材料供应不足问题,提升国产溅射靶材的市场竞争力'

def clean(text):
    text = text.replace('<p>', ' ')
    text = text.replace('</p>', ' ')
    text = text.replace('<br>', ' ')
    text = text.replace('</br>', ' ')
    text = text.replace('><', ' ')
    text = text.replace('\u3000', ' ')
    text = text.replace('br', ' ')
    cltext = text.replace('\n', ' ').strip()
    return str(cltext)

def run_collocation_on_text(page):
    page = clean(page)
    corpus = []
    collocations = []
    nlp = spacy.load('zh_core_web_sm')
    doc = nlp(page)
    for token in doc:
        if not token.is_stop:
            # print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop)
            corpus.append(token.text.lower())
    biagram_collocation = BigramCollocationFinder.from_words(corpus)
    #biagram_collocation.apply_freq_filter(3)
    trigram_collocation = TrigramCollocationFinder.from_words(corpus)
    #trigram_collocation.apply_freq_filter(3)
    scoredbigrams = biagram_collocation.score_ngrams(BigramAssocMeasures().likelihood_ratio)[:10]
    scoretrigrams = trigram_collocation.score_ngrams(TrigramAssocMeasures().likelihood_ratio)[:10]
    allscores = scoredbigrams + scoretrigrams
    for item in allscores:
        itemstr = " ".join(i for i in item[0])
        translation = translate(itemstr).text.lower()
        score = item[1] / 1000000
        collocations.append({"0 Term": itemstr, "1 Translation": translation, "2 LogRatio": score})
    collocations = sorted(collocations, key=lambda x: x["2 LogRatio"], reverse=True)
    result = {'output': collocations, 'message': 'Done', 'code': 'SUCCESS'}
    return result
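For reference, NLTK's score_ngrams returns (ngram, score) pairs already sorted best-first, which is what the item[0] / item[1] indexing above relies on. A tiny standalone sketch (not part of the commit) with toy English tokens:

# Standalone sketch of the score_ngrams output shape
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

words = ['new', 'materials', 'research', 'new', 'materials', 'funding']
finder = BigramCollocationFinder.from_words(words)
for ngram, score in finder.score_ngrams(BigramAssocMeasures().likelihood_ratio)[:3]:
    # each entry is ((word1, word2), score), highest-scored first
    print(" ".join(ngram), round(score, 2))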
@@ -5,77 +5,67 @@ from shared.translate import translate
from wasabi import Printer
from spacy.matcher import PhraseMatcher
import re
from nltk.collocations import TrigramCollocationFinder
from nltk.metrics import TrigramAssocMeasures
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

#page = '专精特新”企业,是指具有专业化、精细化、特色化、新颖化四大特征的中小企业。创新是这类企业的灵魂,足够的 研发费用投入则是开展创新的重要保障。许多尚处在成长期的“专精特新”企业,近期普遍遭遇“钱紧”难题。如何集聚 更多的资金投入研发、保持创新领先地位是这些企业近来面临的最大烦恼。“作为一家新材料研发公司,创新是我们发展的重要驱动力,只有研发投入的不断加码,企业创新发展的步伐才不会 降速。”浙江省“专精特新”企业、宁波创润新材料有限公司董事长吴景晖说,过去3年,企业在研发投入方面不遗余力,累计投入2500万元,这对企业来说不是个小数目。 今年新兴市场的研发需求十分迫切,我们一直想加快 超高纯钛及钛合金中试生产线项目 的研发进度,但苦于资金不 足。令人高兴的是,今年4月340万元存量增值税留抵税额的到账,有效缓解了企业的资金压力,加快了企业的研发 进度。”吴景晖说,目前,“超高纯钛及钛合金中试生产线项目”正在有序推进,一旦投产将缓解半导体产业的高纯钛原材料供应不足问题,提升国产溅射靶材的市场竞争力'

def clean(text):
    text = text.replace('<p>', ' ')
    text = text.replace('</p>', ' ')
    text = text.replace('<br>', ' ')
    text = text.replace('</br>', ' ')
    text = text.replace('><', ' ')
    text = text.replace('\u3000', ' ')
    text = text.replace('br', ' ')
    text = text.replace('——', '')
    text = text.replace('[38;5;1m', '')
    text = text.replace('[0m', '')
    cltext = text.replace('\n', ' ').strip()
    return str(cltext)

def collocations(doc):
    corpus = []
    collocations = []
    for token in doc:
        if not token.is_stop:
            # print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop)
            corpus.append(token.text.lower())
    biagram_collocation = BigramCollocationFinder.from_words(corpus)
    # biagram_collocation.apply_freq_filter(3)
    trigram_collocation = TrigramCollocationFinder.from_words(corpus)
    # trigram_collocation.apply_freq_filter(3)
    scoredbigrams = biagram_collocation.score_ngrams(BigramAssocMeasures().likelihood_ratio)[:10]
    scoretrigrams = trigram_collocation.score_ngrams(TrigramAssocMeasures().likelihood_ratio)[:10]
    allscores = scoredbigrams + scoretrigrams
    for item in allscores:
        itemstr = " ".join(i for i in item[0])
        translation = translate(itemstr).text.lower()
        score = item[1] / 1000000
        collocations.append({"0 Term": itemstr, "1 Translation": translation, "2 LogRatio": score})
    collocations = sorted(collocations, key=lambda x: x["2 LogRatio"], reverse=True)
    terms = [item.get('0 Term') for item in collocations]
    return terms

def run_concordance_on_text(page):
    page = clean(page)
    print('Page')
    print(page)
    nlp = spacy.load('zh_core_web_sm')
    doc = nlp(page)
    terms = collocations(doc)

@@ -87,24 +77,29 @@ def run_concordance_on_text(page):
    matches = matcher(doc)
    match = Printer()
    for i, start, end in matches:
        perecedingSlice = doc[start - 20: start].text
        #perecedingSliceTr = clean(translate(doc[start - 20: start]).text)
        matchedTerm = doc[start:end].text
        print(matchedTerm)
        #matchedTerm = doc[start:end].text
        matchedTermTr = match.text(translate(doc[start:end].text).text, color='red', no_print=True)
        #matchedTermTr = match.text(translate(doc[start:end].text).text)
        followingSlice = doc[end:end + 20].text
        #followingSliceTr = clean(translate(doc[end:end + 20]).text)
        #context = perecedingSlice+', '+matchedTerm+', '+followingSlice
        #contextTr = perecedingSliceTr+', '+matchedTermTr+', '+followingSliceTr
        #concordances.append({"0 Term": escapeAnscii(matchedTerm), "1 Eng": escapeAnscii(matchedTermTr), "2 Context": escapeAnscii(context), "3 Context Eng": escapeAnscii(contextTr)})
        concordances.append({"0 Preceded By": perecedingSlice, "1 Term": matchedTerm, "2 Followed By": followingSlice})
    print(concordances)
    result = {'output': concordances, 'message': 'Done', 'code': 'SUCCESS'}
    return result
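The diff elides the PhraseMatcher construction between terms = collocations(doc) and matches = matcher(doc); a typical setup consistent with those two lines would be the following sketch (an assumption, not necessarily the author's exact code):

# Hypothetical reconstruction of the elided matcher setup
from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab)
matcher.add("COLLOCATIONS", [nlp.make_doc(term) for term in terms])
matches = matcher(doc)  # yields (match_id, start, end) token offsets used in the loop above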
...
import pandas as pd
from db.db_config import get_db

#page = '尼罗河 是一条流經非洲東部與北部的河流,與中非地區的剛果河、非洲南部的赞比西河以及西非地区的尼日尔河並列非洲最大的四個河流系統。'

# Fetch every document from the news table
def run_multidatasets():
    conn, cursor = get_db()
    cursor.execute('SELECT * from news;')
    res = cursor.fetchall()
    data = []
    for row in res:
        print(row)
        data.append({"0 Id": row[0], "1 Title": row[1], "2 Date": row[4], "3 Content": row[-1]})
    result = {'output': data, 'message': 'Done', 'code': 'SUCCESS'}
    return result
from ckip_transformers.nlp import CkipNerChunker
from db.db_config import get_db

#page = '尼罗河 是一条流經非洲東部與北部的河流,與中非地區的剛果河、非洲南部的赞比西河以及西非地区的尼日尔河並列非洲最大的四個河流系統。'

# Perform NER on every document in the news table
def run_neroverall_on_text(page):
    ner_driver = CkipNerChunker(model="bert-base")
    conn, cursor = get_db()
    cursor.execute('SELECT * from news;')
    res = cursor.fetchall()
    data = []
    for row in res:
        docid = row[0]
        content = row[-1].replace('\n', ' ').replace('\t', ' ')
        data.append([docid, content])
    ner_with_count = []
    for i in range(0, len(data)):
        id = data[i][0]
        txt = data[i][1]
        ner = ner_driver([txt])
        tags = []
        for item in ner[0]:
            word = item.word
            ner = item.ner
            tags.append(word + '__' + ner)
        # Keep one label per distinct word__label pair, then count labels per document
        ners = []
        seen_words = []
        seen_tags = []
        for tag in tags:
            if tag not in seen_words:
                ner = tag.split('__')[1].strip()
                ners.append(ner)
                seen_words.append(tag)
        for n in ners:
            if n not in seen_tags:
                freq = ners.count(n) / 1000000
                ner_with_count.append({"0 Doc Id": id, "1 NER": n, "2 Frequency": freq})
                seen_tags.append(n)
    #nerall = sorted(ner_with_count, key=lambda x: (x["0 Doc Id"], x["2 Frequency"]), reverse=True)
    result = {'output': ner_with_count, 'message': 'Done', 'code': 'SUCCESS'}
    return result
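CkipNerChunker returns one token list per input text; each element is a NerToken whose word, ner, and idx fields are what the loop above reads. A minimal standalone sketch (not part of the commit):

# Standalone sketch of the CkipNerChunker output shape
from ckip_transformers.nlp import CkipNerChunker

ner_driver = CkipNerChunker(model="bert-base")
for token in ner_driver(["尼罗河是一条流經非洲東部與北部的河流。"])[0]:
    # e.g. 尼罗河 with a location-type label and character offsets
    print(token.word, token.ner, token.idx)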
@@ -14,6 +14,7 @@ def run_usas_on_text(page):
        val = lineL[1].strip()
        d[key] = val
    # We exclude the following components as we do not need them.
    nlp = spacy.load('zh_core_web_sm', exclude=['parser', 'ner'])
    # Load the Chinese PyMUSAS rule-based tagger in a separate spaCy pipeline

@@ -31,6 +32,7 @@ def run_usas_on_text(page):
        idx = (start, end)
        for el in token._.pymusas_tags:
            el = el.split('.')[0]  # keep only the top-level USAS tag, e.g. 'A1.1.1' -> 'A1'
            #obj = {"word": token.text, "USAS Tags": el, "idx": idx}
            tags.append(el)
            #data.append(obj)

@@ -51,3 +53,4 @@ def run_usas_on_text(page):
    return result
@@ -71,3 +71,21 @@ def concordance():
    result = get_concordance_for_data(page)
    return result

@app.route("/multidatasets", methods=['POST'])
def multidatasets():
    #request_data = request.get_json()
    #page = request_data['page']
    result = run_multidatasets_all()
    return result

@app.route("/neroverall", methods=['POST'])
def neroverall():
    request_data = request.get_json()
    page = request_data['page']
    result = run_neroverall_all(page)
    return result
\ No newline at end of file
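A quick smoke test for the two new routes (not part of the commit), assuming the Flask app is running on localhost:5000, the default:

# Smoke test sketch for the new endpoints
import requests

r = requests.post("http://localhost:5000/multidatasets")
print(r.status_code, r.json()["code"])  # expect 201 and 'SUCCESS' once the db has data

r = requests.post("http://localhost:5000/neroverall", json={"page": ""})
print(r.status_code, r.json()["code"])  # 'page' is currently unused by run_neroverall_on_text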