Commit 844466c3 authored by Thomas Edwards's avatar Thomas Edwards

build with all functions working on a dynamic dataset

parent 75609fe7
Branch: All-functions-working
No related tags found
1 merge request: !1 Demo branch
Showing with 469 additions and 19 deletions
@@ -8,5 +8,20 @@
      <jdbc-url>jdbc:sqlite:$PROJECT_DIR$/db/data.db</jdbc-url>
      <working-dir>$ProjectFileDir$</working-dir>
    </data-source>
    <data-source source="LOCAL" name="datasets" uuid="60936a36-cd03-415e-8d9f-d2cf25edcba4">
      <driver-ref>sqlite.xerial</driver-ref>
      <synchronize>true</synchronize>
      <jdbc-driver>org.sqlite.JDBC</jdbc-driver>
      <jdbc-url>jdbc:sqlite:$PROJECT_DIR$/db/datasets.db</jdbc-url>
      <working-dir>$ProjectFileDir$</working-dir>
      <libraries>
        <library>
          <url>file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/xerial/sqlite-jdbc/3.45.1.0/sqlite-jdbc-3.45.1.0.jar</url>
        </library>
        <library>
          <url>file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/slf4j/slf4j-api/1.7.36/slf4j-api-1.7.36.jar</url>
        </library>
      </libraries>
    </data-source>
  </component>
</project>
\ No newline at end of file
@@ -9,6 +9,10 @@ from func.concordance.concordance import *
from func.mutlidatasets.multidatasets import *
from func.neroverall.neroverall import *
from func.usasFine.usasFine import *
from func.upload.upload import *
from func.getdataset.getdataset import *
from func.getfiles.getfiles import *
from func.getallids.getallids import *
# Perform NER on a file
# TAKES XML text page
@@ -97,4 +101,35 @@ def run_neroverall_all(page):
    return make_response(jsonify(result), 400)
def run_upload_all(page):
    result = upload_file(page)
    if result["code"] == "SUCCESS":
        return make_response(jsonify(result), 201)
    return make_response(jsonify(result), 400)
def get_dataset_all(page):
    result = get_dataset(page)
    if result["code"] == "SUCCESS":
        return make_response(jsonify(result), 201)
    return make_response(jsonify(result), 400)
def get_files_all(page):
    result = get_files(page)
    if result["code"] == "SUCCESS":
        return make_response(jsonify(result), 201)
    return make_response(jsonify(result), 400)
def get_dataset_ids():
    result = get_all_dataset_ids()
    if result["code"] == "SUCCESS":
        return make_response(jsonify(result), 200)
    return make_response(jsonify(result), 400)
\ No newline at end of file
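Each new dispatcher function above repeats the same wrap-and-status pattern: call the func.* helper, then map its "code" field to an HTTP status with make_response(jsonify(...)). A minimal sketch of a shared helper that would factor this out (the name to_response is hypothetical, not part of this commit):

    from flask import jsonify, make_response

    def to_response(result, ok_status=200):
        # Hypothetical helper: map the application-level "code" field onto an HTTP status.
        status = ok_status if result.get("code") == "SUCCESS" else 400
        return make_response(jsonify(result), status)

run_upload_all, for example, would reduce to return to_response(upload_file(page), ok_status=201). Note also that get_dataset_all and get_files_all answer reads with 201 (Created); 200 is the conventional status there.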
File added
File added
@@ -3,6 +3,6 @@ import sqlite3
def get_db():
    conn = sqlite3.connect('/Users/tom/PycharmProjects/cognistance/db/data.db')
    conn = sqlite3.connect('/Users/tom/PycharmProjects/cognistance/db/datasets.db')
    cursor = conn.cursor()
    return conn, cursor
\ No newline at end of file
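The rewritten get_db() still hardcodes an absolute macOS path, so the app only runs from one checkout on one machine. A portability sketch, assuming db_config.py keeps living in the db/ directory next to datasets.db (not part of this commit):

    import sqlite3
    from pathlib import Path

    # Assumption: datasets.db sits beside this module; resolve it relative to the
    # file instead of a fixed /Users path.
    DB_PATH = Path(__file__).resolve().parent / 'datasets.db'

    def get_db():
        conn = sqlite3.connect(DB_PATH)
        return conn, conn.cursor()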
import sqlite3
# Connect to an SQLite database (or create it if it doesn't exist)
conn = sqlite3.connect('data.db')
#conn = sqlite3.connect('data.db')
conn = sqlite3.connect('datasets.db')
# Create a cursor object using the cursor() method
cursor = conn.cursor()
# Create table
# Create table
cursor.execute('''CREATE TABLE IF NOT EXISTS news
(id integer primary key, title text, location text, pubyear text, pubdate text, websource text, loaddate text, content longtext)''')
#cursor.execute('''CREATE TABLE IF NOT EXISTS news
# (id integer primary key, title text, location text, pubyear text, pubdate text, websource text, loaddate text, content longtext)''')
cursor.execute('''CREATE TABLE IF NOT EXISTS datasets (id INTEGER PRIMARY KEY AUTOINCREMENT)''')
cursor.execute('''CREATE TABLE IF NOT EXISTS files (id INTEGER PRIMARY KEY AUTOINCREMENT, dataset_id INTEGER, filename TEXT, date TEXT, content TEXT, FOREIGN KEY(dataset_id) REFERENCES datasets(id))''')
# Insert a row of data
#cursor.execute("INSERT INTO stocks VALUES ('2006-01-05','BUY','RHAT',100,35.14)")
# to select all columns we will use
statement = '''SELECT * FROM news'''
statement = '''SELECT * FROM datasets'''
cursor.execute(statement)
print("All the data")
......
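The new files table declares FOREIGN KEY(dataset_id) REFERENCES datasets(id), but SQLite does not enforce foreign-key constraints unless each connection opts in. If enforcement is wanted, every connection, including the one opened in get_db(), would need:

    conn.execute('PRAGMA foreign_keys = ON')  # per-connection; off by default in SQLite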
@@ -10,12 +10,13 @@ from db.db_config import get_db
def run_collocation_on_text(page):
    datasetid = page.split('><p>')[0].replace('<div id=', '').replace('"', '').strip()
    collocations = []
    nlp = spacy.load('zh_core_web_sm')
    conn, cursor = get_db()
    cursor.execute('SELECT * from news;')
    #cursor.execute('SELECT * from news;')
    cursor.execute('SELECT * from files where dataset_id = "' + datasetid + '";')
    res = cursor.fetchall()
    data = []
......
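The new files lookup splices datasetid, which is parsed out of client-supplied markup, straight into the SQL string; the same concatenation recurs in the ner, neroverall, sentiment, usas and usasFine changes below. A parameterized sketch of the same query, using the placeholder support sqlite3 already provides:

    # Same lookup without string concatenation; sqlite3 handles quoting and typing.
    cursor.execute('SELECT * FROM files WHERE dataset_id = ?', (datasetid,))
    res = cursor.fetchall()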
from db.db_config import get_db
def get_all_dataset_ids():
    """Fetch all datasets and their associated files."""
    conn, cursor = get_db()
    cursor = conn.cursor()
    cursor.execute("SELECT id FROM datasets")
    datasets = cursor.fetchall()
    dataset_list = []
    for dataset in datasets:
        #print('inside get_all_dataset_ids dataset', dataset)
        dataset_id = dataset[0]
        cursor.execute("SELECT filename, date FROM files WHERE dataset_id = ?", (dataset_id,))
        files = cursor.fetchall()
        dataset_list.append({
            "dataset_id": dataset_id,
            "files": [{"title": f[0], "date": f[1]} for f in files]
        })
    conn.close()
    print('inside get_all_dataset_ids, dataset_list: ', dataset_list)
    return {"datasets": dataset_list, "message": "Success", "code": "SUCCESS"}
from db.db_config import get_db
def get_dataset(page):
    is_empty = page.get("is_empty", True)
    dataset_id = page.get("dataset_id")  # Check if dataset ID is provided
    conn, cursor = get_db()
    cursor = conn.cursor()
    if is_empty and not dataset_id:
        # Create a new dataset ID if table is empty and no ID exists
        cursor.execute("INSERT INTO datasets DEFAULT VALUES")
        dataset_id = cursor.lastrowid
    elif not is_empty and dataset_id:
        # If table is not empty and dataset ID exists, return it
        pass  # No need to fetch anything, just return the given dataset_id
    else:
        # If no dataset ID was provided, return the latest one in the database
        cursor.execute("SELECT id FROM datasets ORDER BY id DESC LIMIT 1")
        result = cursor.fetchone()
        dataset_id = result[0] if result else None
    conn.commit()
    conn.close()
    result = {"dataset_id": dataset_id, "message": "Done", "code": "SUCCESS"}
    return result
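get_dataset branches on two JSON fields. The three request shapes its branches expect, inferred from the code (the concrete values are illustrative assumptions):

    page = {"is_empty": True}                    # inserts a new datasets row, returns its id
    page = {"is_empty": False, "dataset_id": 3}  # echoes the supplied id back unchanged
    page = {"is_empty": False}                   # falls through to the newest id in the table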
from db.db_config import get_db
def get_files(dataset_id):
    print('received page within get files: ', dataset_id)
    # Read JSON body
    if not dataset_id:
        return {"message": "Missing dataset_id", "code": "ERROR"}, 400  # Return error if missing
    conn, cursor = get_db()
    cursor.execute("SELECT filename, date FROM files WHERE dataset_id = ?", (dataset_id,))
    files = [{"title": row[0], "date": row[1]} for row in cursor.fetchall()]
    print(f"Retrieved files for dataset_id {dataset_id}: {files}")
    conn.close()
    if not files:
        print(f"No files found for dataset_id {dataset_id}")  # Log warning
        return {"files": [], "message": "No files found", "code": "EMPTY"}
    return {"files": files, "message": "Success", "code": "SUCCESS"}
......@@ -10,9 +10,12 @@ from db.db_config import get_db
# Perform NER on Text
def run_ner_on_text(page):
print('NER tag: ',page)
nertag = page.split('__')[0]
datasetid = page.split('__')[1].split('><p>')[0].replace('<div id=','').replace('"','').strip()
ner_driver = CkipNerChunker(model="bert-base")
conn, cursor = get_db()
cursor.execute('SELECT * from news;')
#cursor.execute('SELECT * from files;')
cursor.execute('SELECT * from files where dataset_id = "' + datasetid + '";')
res = cursor.fetchall()
data = []
for row in res:
......@@ -42,7 +45,7 @@ def run_ner_on_text(page):
word = tag.split('__')[0]
ner = tag.split('__')[1]
#translation = translate(word).text
if ner == page:
if ner == nertag:
ner_words_with_count.append({"0 Word": word, "1 NER": ner, "2 Frequency": freq})
seen_words.append(tag)
......
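run_ner_on_text now receives two values smuggled through one string: the NER tag and the dataset id, as 'TAG__<div id="3"><p>…'. A sketch that makes the implied wire format explicit (same parse as the chained replace() calls above; the function name is hypothetical):

    import re

    def parse_tag_and_dataset(page):
        # Assumes the client sends 'NERTAG__<div id="123"><p>...', as the code above implies.
        nertag, rest = page.split('__', 1)
        m = re.match(r'<div id="?(\d+)"?', rest)
        datasetid = m.group(1) if m else None
        return nertag, datasetid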
from ckip_transformers.nlp import CkipNerChunker
from db.db_config import get_db
import html_to_json
#page = '尼罗河 是一条流經非洲東部與北部的河流,與中非地區的剛果河、非洲南部的赞比西河以及西非地区的尼日尔河並列非洲最大的四個河流系統。'
# Perform NER on Text
def run_neroverall_on_text(page):
    print('neroverall page old: ',page)
    datasetid = page.split('><p>')[0].replace('<div id=','').replace('"','').strip()
    #output_json = html_to_json.convert_tables(page)
    print('neroverall page: ',page)
    ner_driver = CkipNerChunker(model="bert-base")
    conn, cursor = get_db()
    cursor.execute('SELECT * from news;')
    cursor.execute('SELECT * from files where dataset_id = "'+datasetid+'";')
    res = cursor.fetchall()
    data = []
    for row in res:
        docid = row[0]
        content = row[-1].replace('\n', ' ').replace('\t', ' ')
        print('docid: ',docid)
        print('content: ',content)
        data.append([docid, content])
    ner_with_count = []
@@ -47,10 +55,11 @@ def run_neroverall_on_text(page):
        seen_tags.append(n)
    nerall = sorted(ner_with_count, key=lambda x: x["2 Frequency"], reverse=True)
    result = {'output': nerall,'message': 'Done', 'code': 'SUCCESS'}
    return result
......
@@ -16,8 +16,9 @@ def zng(paragraph):
def run_sentiment_on_text(page):
    datasetid = page.split('><p>')[0].replace('<div id=', '').replace('"', '').strip()
    conn, cursor = get_db()
    cursor.execute('SELECT * from news;')
    cursor.execute('SELECT * from files where dataset_id = "'+datasetid+'";')
    res = cursor.fetchall()
    data = []
    sentiments = []
......
from db.db_config import get_db
def upload_file(page):
    #filename = page.get('filename')
    #last_modified = page.get('lastModified')
    content = page.get('content')
    dataset_id = page.get("dataset_id")
    file_info = page.get("file")
    conn, cursor = get_db()
    cursor.execute("INSERT INTO files (dataset_id, filename, date, content) VALUES (?, ?, ?, ?)",
                   (dataset_id, file_info["filename"], file_info["lastModified"], file_info["content"]))
    conn.commit()
    conn.close()
    result = {'output': content, 'message': 'Done', 'code': 'SUCCESS'}
    return result
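The request body upload_file expects, inferred from its .get() calls (field values are illustrative assumptions):

    # Illustrative values; keys inferred from upload_file's .get() calls.
    page = {
        "dataset_id": 3,
        "content": "<p>full text</p>",
        "file": {
            "filename": "article.xml",
            "lastModified": "2024-03-01",
            "content": "<p>full text</p>",
        },
    }

Note the top-level content is only echoed back in the response; the stored content comes from file["content"].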
@@ -7,6 +7,8 @@ from db.db_config import get_db
def run_usas_on_text(page):
    print('usasoverall page old: ', page)
    datasetid = page.split('><p>')[0].replace('<div id=', '').replace('"', '').strip()
    d = {}
    with open('/Users/tom/PycharmProjects/cognistance/func/usas/usas_overall.txt') as f:
        for line in f:
@@ -24,7 +26,8 @@ def run_usas_on_text(page):
    nlp.add_pipe('pymusas_rule_based_tagger', source=chinese_tagger_pipeline)
    conn, cursor = get_db()
    cursor.execute('SELECT * from news;')
    #cursor.execute('SELECT * from news;')
    cursor.execute('SELECT * from files where dataset_id = "'+datasetid+'";')
    res = cursor.fetchall()
    data = []
......
A general and abstract terms
B the body and the individual
C arts and crafts
E emotion
F food and farming
G government and public
H architecture, housing and the home
I money and commerce in industry
K entertainment, sports and games
L life and living things
M movement, location, travel and transport
N numbers and measurement
O substances, materials, objects and equipment
P education
Q language and communication
S social actions, states and processes
T Time
W world and environment
X psychological actions, states and processes
Y science and technology
Z names and grammar
\ No newline at end of file
@@ -8,6 +8,8 @@ from db.db_config import get_db
def run_usasFine_on_text(page):
    print('tag: ',page)
    usastag = page.split('__')[0]
    datasetid = page.split('__')[1].split('><p>')[0].replace('<div id=', '').replace('"', '').strip()
    d = {}
    with open('/Users/tom/PycharmProjects/cognistance/func/usas/usas_desc.txt') as f:
        for line in f:
@@ -25,11 +27,13 @@ def run_usasFine_on_text(page):
    nlp.add_pipe('pymusas_rule_based_tagger', source=chinese_tagger_pipeline)
    conn, cursor = get_db()
    cursor.execute('SELECT * from news;')
    #cursor.execute('SELECT * from files;')
    cursor.execute('SELECT * from files where dataset_id = "' + datasetid + '";')
    res = cursor.fetchall()
    data = []
    for row in res:
        print('row in usas fine: ',row)
        docid = row[0]
        content = row[-1].replace('\n', ' ').replace('\t', ' ')
        data.append([docid, content])
@@ -47,7 +51,7 @@ def run_usasFine_on_text(page):
    for token in output_doc:
        start, end = token._.pymusas_mwe_indexes[0]
        idx = (start, end)
        #idx = (start, end)
        for el in token._.pymusas_tags:
            el = el.split('.')[0]
@@ -66,7 +70,8 @@ def run_usasFine_on_text(page):
        word = tag.split('__')[0]
        usas = tag.split('__')[1]
        if page in usas:
        if usastag in usas:
            print('before tag object: ',word)
            tag_object = {"0 Word":word,"1 Discourse Field": usas, "2 Definition":d[usas],"3 Frequency": freq}
            usas_tags_with_count.append(tag_object)
......
A1 GENERAL AND ABSTRACT TERMS
A1.1.1 General actions, making etc.
A1.1.2 Damaging and destroying
A1.2 Suitability
A1.3 Caution
A1.4 Chance, luck
A1.5 Use
A1.5.1 Using
A1.5.2 Usefulness
A1.6 Physical/mental
A1.7 Constraint
A1.8 Inclusion/Exclusion
A1.9 Avoiding
A2 Affect
A2.1 Affect:- Modify, change
A2.2 Affect:- Cause/Connected
A3 Being
A4 Classification
A4.1 Generally kinds, groups, examples
A4.2 Particular/general; detail
A5 Evaluation
A5.1 Evaluation:- Good/bad
A5.2 Evaluation:- True/false
A5.3 Evaluation:- Accuracy
A5.4 Evaluation:- Authenticity
A6 Comparing
A6.1 Comparing:- Similar/different
A6.2 Comparing:- Usual/unusual
A6.3 Comparing:- Variety
A7 Definite (+ modals)
A8 Seem
A9 Getting and giving; possession
A10 Open/closed; Hiding/Hidden; Finding; Showing
A11 Importance
A11.1 Importance: Important
A11.2 Importance: Noticeability
A12 Easy/difficult
A13 Degree
A13.1 Degree: Non-specific
A13.2 Degree: Maximizers
A13.3 Degree: Boosters
A13.4 Degree: Approximators
A13.5 Degree: Compromisers
A13.6 Degree: Diminishers
A13.7 Degree: Minimizers
A14 Exclusivizers/particularizers
A15 Safety/Danger
B1 Anatomy and physiology
B2 Health and disease
B3 medicines and medical treatment
B4 Cleaning and personal care
B5 Clothes and personal belongings
C1 Arts and crafts
E1 EMOTIONAL ACTIONS, STATES AND PROCESSES General
E2 Liking
E3 Calm/Violent/Angry
E4 Happy/sad
E4.1 Happy/sad: Happy
E4.2 Happy/sad: Contentment
E5 Fear/bravery/shock
E6 Worry, concern, confident
F1 Food
F2 Drinks
F3 Cigarettes and drugs
F4 Farming & Horticulture
G1 Government, Politics and elections
G1.1 Government etc.
G1.2 Politics
G2 Crime, law and order
G2.1 Crime, law and order: Law and order
G2.2 General ethics
G3 Warfare, defence and the army; weapons
H1 Architecture and kinds of houses and buildings
H2 Parts of buildings
H3 Areas around or near houses
H4 Residence
H5 Furniture and household fittings
I1 Money generally
I1.1 Money: Affluence
I1.2 Money: Debts
I1.3 Money: Price
I2 Business
I2.1 Business: Generally
I2.2 Business: Selling
I3 Work and employment
I3.1 Work and employment: Generally
I3.2 Work and employment: Professionalism
I4 Industry
K1 Entertainment generally
K2 Music and related activities
K3 Recorded sound etc.
K4 Drama, the theatre and showbusiness
K5 Sports and games generally
K5.1 Sports
K5.2 Games
K6 Children's games and toys
L1 Life and living things
L2 Living creatures generally
L3 Plants
M1 Moving, coming and going
M2 Putting, taking, pulling, pushing, transporting &c.
M3 Vehicles and transport on land
M4 Shipping, swimming etc.
M5 Aircraft and flying
M6 Location and direction
M7 Places
M8 Remaining/stationary
N1 Numbers
N2 Mathematics
N3 Measurement
N3.1 Measurement: General
N3.2 Measurement: Size
N3.3 Measurement: Distance
N3.4 Measurement: Volume
N3.5 Measurement: Weight
N3.6 Measurement: Area
N3.7 Measurement: Length & height
N3.8 Measurement: Speed
N4 Linear order
N5 Quantities
N5.1 Entirety; maximum
N5.2 Exceeding; waste
N6 Frequency etc.
O1 Substances and materials generally
O1.1 Substances and materials generally: Solid
O1.2 Substances and materials generally: Liquid
O1.3 Substances and materials generally: Gas
O2 Objects generally
O3 Electricity and electrical equipment
O4 Physical attributes
O4.1 General appearance and physical properties
O4.2 Judgement of appearance (pretty etc.)
O4.3 Colour and colour patterns
O4.4 Shape
O4.5 Texture
O4.6 Temperature
P1 Education in general
Q1 LINGUISTIC ACTIONS, STATES AND PROCESSES; COMMUNICATION
Q1.1 LINGUISTIC ACTIONS, STATES AND PROCESSES; COMMUNICATION
Q1.2 Paper documents and writing
Q1.3 Telecommunications
Q2 Speech acts
Q2.1 Speech etc:- Communicative
Q2.2 Speech acts
Q3 Language, speech and grammar
Q4 The Media
Q4.1 The Media:- Books
Q4.2 The Media:- Newspapers etc.
Q4.3 The Media:- TV, Radio and Cinema
S1 SOCIAL ACTIONS, STATES AND PROCESSES
S1.1 SOCIAL ACTIONS, STATES AND PROCESSES
S1.1.1 SOCIAL ACTIONS, STATES AND PROCESSES
S1.1.2 Reciprocity
S1.1.3 Participation
S1.1.4 Deserve etc.
S1.2 Personality traits
S1.2.1 Approachability and Friendliness
S1.2.2 Avarice
S1.2.3 Egoism
S1.2.4 Politeness
S1.2.5 Toughness; strong/weak
S1.2.6 Sensible
S2 People
S2.1 People:- Female
S2.2 People:- Male
S3 Relationship
S3.1 Relationship: General
S3.2 Relationship: Intimate/sexual
S4 Kin
S5 Groups and affiliation
S6 Obligation and necessity
S7 Power relationship
S7.1 Power, organizing
S7.2 Respect
S7.3 Competition
S7.4 Permission
S8 Helping/hindering
S9 Religion and the supernatural
T1 Time
T1.1 Time: General
T1.1.1 Time: General: Past
T1.1.2 Time: General: Present; simultaneous
T1.1.3 Time: General: Future
T1.2 Time: Momentary
T1.3 Time: Period
T2 Time: Beginning and ending
T3 Time: Old, new and young; age
T4 Time: Early/late
W1 The universe
W2 Light
W3 Geographical terms
W4 Weather
W5 Green issues
X1 PSYCHOLOGICAL ACTIONS, STATES AND PROCESSES
X2 Mental actions and processes
X2.1 Thought, belief
X2.2 Knowledge
X2.3 Learn
X2.4 Investigate, examine, test, search
X2.5 Understand
X2.6 Expect
X3 Sensory
X3.1 Sensory:- Taste
X3.2 Sensory:- Sound
X3.3 Sensory:- Touch
X3.4 Sensory:- Sight
X3.5 Sensory:- Smell
X4 Mental object
X4.1 Mental object:- Conceptual object
X4.2 Mental object:- Means, method
X5 Attention
X5.1 Attention
X5.2 Interest/boredom/excited/energetic
X6 Deciding
X7 Wanting; planning; choosing
X8 Trying
X9 Ability
X9.1 Ability:- Ability, intelligence
X9.2 Ability:- Success and failure
Y1 Science and technology in general
Y2 Information technology and computing
Z0 Unmatched proper noun
Z1 Personal names
Z2 Geographical names
Z3 Other proper names
Z4 Discourse Bin
Z5 Grammatical bin
Z6 Negative
Z7 If
Z8 Pronouns etc.
Z9 Trash can
Z99 Unmatched
\ No newline at end of file
@@ -96,4 +96,31 @@ def neroverall():
    page = request_data['page']
    result = run_neroverall_all(page)
    return result
@app.route("/upload", methods=['POST'])
def upload():
    request_data = request.get_json()
    page = request_data['page']
    result = run_upload_all(page)
    return result
@app.route("/getdataset", methods=['POST'])
def getdataset():
    request_data = request.get_json()
    page = request_data['page']
    result = get_dataset_all(page)
    return result
@app.route("/getfiles", methods=['POST'])
def getfiles():
    request_data = request.get_json()
    page = request_data['dataset_id']
    result = get_files_all(page)
    return result
\ No newline at end of file
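A sketch of exercising the three new routes end to end with requests (the host and port assume Flask's default dev server, which is not pinned anywhere in this diff):

    import requests

    base = "http://localhost:5000"  # host/port assumed (Flask dev default)
    # Create (or fetch) a dataset id.
    new_id = requests.post(f"{base}/getdataset",
                           json={"page": {"is_empty": True}}).json()["dataset_id"]
    # Attach a file to it.
    requests.post(f"{base}/upload", json={"page": {
        "dataset_id": new_id,
        "content": "<p>text</p>",
        "file": {"filename": "a.xml", "lastModified": "2024-03-01", "content": "<p>text</p>"},
    }})
    # List the files back.
    files = requests.post(f"{base}/getfiles", json={"dataset_id": new_id}).json()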