diff --git a/db/datasets.db b/db/datasets.db
index e25a539924a328ecd136ce8770ddc7cebc8b941e..8489e3980dc54d847378b5dff7902c70befe1462 100644
Binary files a/db/datasets.db and b/db/datasets.db differ
diff --git a/func/collocation/collocation.py b/func/collocation/collocation.py
index b6c12e466ee341b346972b8227216bf74daf9922..c003be77e5f8a1a2a44a27d43898316836163449 100644
--- a/func/collocation/collocation.py
+++ b/func/collocation/collocation.py
@@ -11,6 +11,7 @@ from db.db_config import get_db
 def run_collocation_on_text(page):
     datasetid = page.split('><p>')[0].replace('<div id=', '').replace('"', '').strip()
+    print('dataset id in run_collocation_on_text: ',datasetid)
     collocations = []
     nlp = spacy.load('zh_core_web_sm')
@@ -22,6 +23,7 @@ def run_collocation_on_text(page):
     data = []
     for row in res:
+        print('row in collocation from db: ',row)
         docid = row[0]
diff --git a/func/concordance/concordance.py b/func/concordance/concordance.py
index 5a0fe246bb643fc98e302fd7a7296e7ec339c8d2..71e7c12b254d296e1dbdbcfe3a808bbbbfb4e7c1 100644
--- a/func/concordance/concordance.py
+++ b/func/concordance/concordance.py
@@ -8,12 +8,13 @@ from nltk.metrics import BigramAssocMeasures
-def collocations():
+def collocations(datasetid):
     collocations = []
     nlp = spacy.load('zh_core_web_sm')
     conn, cursor = get_db()
-    cursor.execute('SELECT * from news;')
+    #cursor.execute('SELECT * from news;')
+    cursor.execute('SELECT * from files where dataset_id = "' + datasetid + '";')
     res = cursor.fetchall()
     data = []
@@ -65,11 +66,13 @@ def collocations():
 def run_concordance_on_text(page):
-    #print('page: ',page)
-    page = page+'部'
+    datasetid = page.replace("<p>Collocations for the word '部' (department) for ",'').replace('</p>','').strip()
+    print('datasetid inside run_concordance_on_text: ',datasetid)
+    #page = page+'部'
     nlp = spacy.load('zh_core_web_sm')
     conn, cursor = get_db()
-    cursor.execute('SELECT * from news;')
+    #cursor.execute('SELECT * from news;')
+    cursor.execute('SELECT * from files where dataset_id = "' + datasetid + '";')
     res = cursor.fetchall()
     data = []
     for row in res:
@@ -78,7 +81,7 @@ def run_concordance_on_text(page):
         data.append([docid, content])
     concordances = []
-    terms = collocations()
+    terms = collocations(datasetid)
     #terms = [page]
     for i in range(0, len(data)):
diff --git a/func/neroverall/neroverall.py b/func/neroverall/neroverall.py
index 491237dfd60758b2f00f511b5d5582f44010b0d4..ff06fc22b562cf0581c717dc4dece3e663206ded 100644
--- a/func/neroverall/neroverall.py
+++ b/func/neroverall/neroverall.py
@@ -9,7 +9,7 @@ def run_neroverall_on_text(page):
     print('neroverall page old: ',page)
     datasetid = page.split('><p>')[0].replace('<div id=','').replace('"','').strip()
     #output_json = html_to_json.convert_tables(page)
-    print('neroverall page: ',page)
+    print('neroverall datasetid: ',datasetid)
     ner_driver = CkipNerChunker(model="bert-base")
     conn, cursor = get_db()
diff --git a/func/usas/usas.py b/func/usas/usas.py
index 083b3217bb6b5a21e6c3470ec49b26061e19c225..b961022e488ed3fe3f6600a1372071b147a95cd6 100644
--- a/func/usas/usas.py
+++ b/func/usas/usas.py
@@ -9,6 +9,7 @@ def run_usas_on_text(page):
     print('usasoverall page old: ', page)
     datasetid = page.split('><p>')[0].replace('<div id=', '').replace('"', '').strip()
+    print('usas overall datasetid: ', datasetid)
     d = {}
     with open('func/usas/usas_overall.txt') as f:
         for line in f: