Commit 03303cee authored by Thomas Edwards

demo branch

parent 11eb0225
Branch: pleaseSQLWork
Merge request: !1 Demo branch
(One changed file has no preview for this file type.)
@@ -11,6 +11,7 @@ from db.db_config import get_db
 def run_collocation_on_text(page):
     datasetid = page.split('><p>')[0].replace('<div id=', '').replace('"', '').strip()
+    print('dataset id in run_collocation_on_text: ',datasetid)
     collocations = []
     nlp = spacy.load('zh_core_web_sm')
@@ -22,6 +23,7 @@ def run_collocation_on_text(page):
     data = []
     for row in res:
+        print('row in collocation from db: ',row)
         docid = row[0]
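For reference, a minimal sketch of the id-parsing step the hunk above relies on, run against a made-up payload (the sample markup '<div id="42"><p>...</p></div>' is an assumption about what this route receives, not taken from the repository):

# Sketch of the dataset-id extraction used in run_collocation_on_text.
# The sample payload below is hypothetical.
def extract_dataset_id(page: str) -> str:
    head = page.split('><p>')[0]                      # e.g. '<div id="42"'
    return head.replace('<div id=', '').replace('"', '').strip()

print(extract_dataset_id('<div id="42"><p>some text</p></div>'))  # -> 42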
@@ -8,12 +8,13 @@ from nltk.metrics import BigramAssocMeasures
-def collocations():
+def collocations(datasetid):
     collocations = []
     nlp = spacy.load('zh_core_web_sm')
     conn, cursor = get_db()
-    cursor.execute('SELECT * from news;')
+    #cursor.execute('SELECT * from news;')
+    cursor.execute('SELECT * from files where dataset_id = "' + datasetid + '";')
     res = cursor.fetchall()
     data = []
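The new files query is built by concatenating datasetid into the SQL string. As a hedged alternative sketch, the same lookup with a bound parameter (assuming get_db returns a standard DB-API connection/cursor pair; the '?' placeholder is sqlite3-style, other drivers use '%s'):

from db.db_config import get_db  # same helper the diff already imports

def fetch_dataset_files(datasetid):
    conn, cursor = get_db()
    # bind datasetid instead of splicing it into the SQL text
    cursor.execute('SELECT * FROM files WHERE dataset_id = ?', (datasetid,))
    return cursor.fetchall()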
@@ -65,11 +66,13 @@ def collocations():
 def run_concordance_on_text(page):
-    #print('page: ',page)
-    page = page+''
-    #page = page+'部'
+    datasetid = page.replace("<p>Collocations for the word '' (department) for ",'').replace('</p>','').strip()
+    print('datasetid inside run_concordance_on_text: ',datasetid)
     nlp = spacy.load('zh_core_web_sm')
     conn, cursor = get_db()
-    cursor.execute('SELECT * from news;')
+    #cursor.execute('SELECT * from news;')
+    cursor.execute('SELECT * from files where dataset_id = "' + datasetid + '";')
     res = cursor.fetchall()
     data = []
     for row in res:
@@ -78,7 +81,7 @@ def run_concordance_on_text(page):
         data.append([docid, content])
     concordances = []
-    terms = collocations()
+    terms = collocations(datasetid)
     #terms = [page]
     for i in range(0, len(data)):
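The hunks above only show collocations() gaining a datasetid parameter and run_concordance_on_text passing it through; the body of collocations() is not visible. A rough sketch of what such a function can do with the fetched texts, assuming NLTK's bigram finder given the BigramAssocMeasures import (an illustration, not the repository's actual implementation):

import spacy
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

def top_bigram_collocations(texts, n=10):
    nlp = spacy.load('zh_core_web_sm')
    # flatten the spaCy tokens of every fetched document into one word stream
    words = [tok.text for text in texts for tok in nlp(text)]
    finder = BigramCollocationFinder.from_words(words)
    finder.apply_freq_filter(2)   # ignore bigrams that occur only once
    return finder.nbest(BigramAssocMeasures.likelihood_ratio, n)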
@@ -9,7 +9,7 @@ def run_neroverall_on_text(page):
     print('neroverall page old: ',page)
     datasetid = page.split('><p>')[0].replace('<div id=','').replace('"','').strip()
     #output_json = html_to_json.convert_tables(page)
-    print('neroverall page: ',page)
+    print('neroverall datasetid: ',datasetid)
     ner_driver = CkipNerChunker(model="bert-base")
     conn, cursor = get_db()
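For context, a minimal sketch of applying the CkipNerChunker driver created in this function to the fetched texts; the Counter-based tallying is an assumption about what the "overall" view aggregates, not code shown in the diff:

from collections import Counter
from ckip_transformers.nlp import CkipNerChunker

def ner_label_counts(texts):
    ner_driver = CkipNerChunker(model="bert-base")
    results = ner_driver(texts)        # one list of NER chunks per input text
    counts = Counter()
    for chunks in results:
        for chunk in chunks:
            counts[chunk.ner] += 1     # entity label, e.g. PERSON, GPE, ORG
    return counts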
@@ -9,6 +9,7 @@ from db.db_config import get_db
 def run_usas_on_text(page):
     print('usasoverall page old: ', page)
     datasetid = page.split('><p>')[0].replace('<div id=', '').replace('"', '').strip()
+    print('usas overall datasetid: ', datasetid)
     d = {}
     with open('func/usas/usas_overall.txt') as f:
         for line in f:
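The hunk shows func/usas/usas_overall.txt being read line by line into the dict d, but the file's layout is not visible here. A minimal sketch under the assumption that each line holds a USAS tag followed by its description:

def load_usas_mapping(path='func/usas/usas_overall.txt'):
    d = {}
    with open(path) as f:
        for line in f:
            parts = line.strip().split(None, 1)   # assumed "TAG  description" layout
            if len(parts) == 2:
                d[parts[0]] = parts[1]
    return d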