Commit 844466c3 authored by Thomas Edwards's avatar Thomas Edwards

build with all functions working on a dynamic dataset

parent 75609fe7
Branch: All-functions-working
No related tags found
1 merge request: !1 Demo branch
Showing with 469 additions and 19 deletions
@@ -8,5 +8,20 @@
      <jdbc-url>jdbc:sqlite:$PROJECT_DIR$/db/data.db</jdbc-url>
      <working-dir>$ProjectFileDir$</working-dir>
    </data-source>
    <data-source source="LOCAL" name="datasets" uuid="60936a36-cd03-415e-8d9f-d2cf25edcba4">
      <driver-ref>sqlite.xerial</driver-ref>
      <synchronize>true</synchronize>
      <jdbc-driver>org.sqlite.JDBC</jdbc-driver>
      <jdbc-url>jdbc:sqlite:$PROJECT_DIR$/db/datasets.db</jdbc-url>
      <working-dir>$ProjectFileDir$</working-dir>
      <libraries>
        <library>
          <url>file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/xerial/sqlite-jdbc/3.45.1.0/sqlite-jdbc-3.45.1.0.jar</url>
        </library>
        <library>
          <url>file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/slf4j/slf4j-api/1.7.36/slf4j-api-1.7.36.jar</url>
        </library>
      </libraries>
    </data-source>
  </component>
</project>
\ No newline at end of file
@@ -9,6 +9,10 @@ from func.concordance.concordance import *
from func.mutlidatasets.multidatasets import *
from func.neroverall.neroverall import *
from func.usasFine.usasFine import *
from func.upload.upload import *
from func.getdataset.getdataset import *
from func.getfiles.getfiles import *
from func.getallids.getallids import *
# Perform NER on a file
# TAKES XML text page
@@ -97,4 +101,35 @@ def run_neroverall_all(page):
    return make_response(jsonify(result), 400)
def run_upload_all(page):
    result = upload_file(page)
    if result["code"] == "SUCCESS":
        return make_response(jsonify(result), 201)
    return make_response(jsonify(result), 400)
def get_dataset_all(page):
    result = get_dataset(page)
    if result["code"] == "SUCCESS":
        return make_response(jsonify(result), 201)
    return make_response(jsonify(result), 400)
def get_files_all(page):
    result = get_files(page)
    if result["code"] == "SUCCESS":
        return make_response(jsonify(result), 201)
    return make_response(jsonify(result), 400)
def get_dataset_ids():
    result = get_all_dataset_ids()
    if result["code"] == "SUCCESS":
        return make_response(jsonify(result), 200)
    return make_response(jsonify(result), 400)
\ No newline at end of file
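Each new dispatcher function above repeats the same wrap-and-status pattern: call the func.* helper, then map its "code" field to an HTTP status with make_response(jsonify(...)). A minimal sketch of a shared helper that would factor this out (the name to_response is hypothetical, not part of this commit):

    from flask import jsonify, make_response

    def to_response(result, ok_status=200):
        # Hypothetical helper: map the application-level "code" field onto an HTTP status.
        status = ok_status if result.get("code") == "SUCCESS" else 400
        return make_response(jsonify(result), status)

run_upload_all, for example, would reduce to return to_response(upload_file(page), ok_status=201). Note also that get_dataset_all and get_files_all answer reads with 201 (Created); 200 is the conventional status there.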
File added
File added
@@ -3,6 +3,6 @@ import sqlite3
def get_db():
    conn = sqlite3.connect('/Users/tom/PycharmProjects/cognistance/db/data.db')
    conn = sqlite3.connect('/Users/tom/PycharmProjects/cognistance/db/datasets.db')
    cursor = conn.cursor()
    return conn, cursor
\ No newline at end of file
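The rewritten get_db() still hardcodes an absolute macOS path, so the app only runs from one checkout on one machine. A portability sketch, assuming db_config.py keeps living in the db/ directory next to datasets.db (not part of this commit):

    import sqlite3
    from pathlib import Path

    # Assumption: datasets.db sits beside this module; resolve it relative to the
    # file instead of a fixed /Users path.
    DB_PATH = Path(__file__).resolve().parent / 'datasets.db'

    def get_db():
        conn = sqlite3.connect(DB_PATH)
        return conn, conn.cursor()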
import sqlite3
# Connect to an SQLite database (or create it if it doesn't exist)
conn = sqlite3.connect('data.db')
#conn = sqlite3.connect('data.db')
conn = sqlite3.connect('datasets.db')
# Create a cursor object using the cursor() method
cursor = conn.cursor()
# Create table
# Create table
cursor.execute('''CREATE TABLE IF NOT EXISTS news
(id integer primary key, title text, location text, pubyear text, pubdate text, websource text, loaddate text, content longtext)''')
#cursor.execute('''CREATE TABLE IF NOT EXISTS news
# (id integer primary key, title text, location text, pubyear text, pubdate text, websource text, loaddate text, content longtext)''')
cursor.execute('''CREATE TABLE IF NOT EXISTS datasets (id INTEGER PRIMARY KEY AUTOINCREMENT)''')
cursor.execute('''CREATE TABLE IF NOT EXISTS files (id INTEGER PRIMARY KEY AUTOINCREMENT, dataset_id INTEGER, filename TEXT, date TEXT, content TEXT, FOREIGN KEY(dataset_id) REFERENCES datasets(id))''')
# Insert a row of data
#cursor.execute("INSERT INTO stocks VALUES ('2006-01-05','BUY','RHAT',100,35.14)")
# to select all columns we will use
statement = '''SELECT * FROM news'''
statement = '''SELECT * FROM datasets'''
cursor.execute(statement)
print("All the data")
......
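The new files table declares FOREIGN KEY(dataset_id) REFERENCES datasets(id), but SQLite does not enforce foreign-key constraints unless each connection opts in. If enforcement is wanted, every connection, including the one opened in get_db(), would need:

    conn.execute('PRAGMA foreign_keys = ON')  # per-connection; off by default in SQLite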
@@ -10,12 +10,13 @@ from db.db_config import get_db
def run_collocation_on_text(page):
    datasetid = page.split('><p>')[0].replace('<div id=', '').replace('"', '').strip()
    collocations = []
    nlp = spacy.load('zh_core_web_sm')
    conn, cursor = get_db()
    cursor.execute('SELECT * from news;')
    #cursor.execute('SELECT * from news;')
    cursor.execute('SELECT * from files where dataset_id = "' + datasetid + '";')
    res = cursor.fetchall()
    data = []
......
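The new files lookup splices datasetid, which is parsed out of client-supplied markup, straight into the SQL string; the same concatenation recurs in the ner, neroverall, sentiment, usas and usasFine changes below. A parameterized sketch of the same query, using the placeholder support sqlite3 already provides:

    # Same lookup without string concatenation; sqlite3 handles quoting and typing.
    cursor.execute('SELECT * FROM files WHERE dataset_id = ?', (datasetid,))
    res = cursor.fetchall()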
from db.db_config import get_db
def get_all_dataset_ids():
    """Fetch all datasets and their associated files."""
    conn, cursor = get_db()
    cursor = conn.cursor()
    cursor.execute("SELECT id FROM datasets")
    datasets = cursor.fetchall()
    dataset_list = []
    for dataset in datasets:
        #print('inside get_all_dataset_ids dataset', dataset)
        dataset_id = dataset[0]
        cursor.execute("SELECT filename, date FROM files WHERE dataset_id = ?", (dataset_id,))
        files = cursor.fetchall()
        dataset_list.append({
            "dataset_id": dataset_id,
            "files": [{"title": f[0], "date": f[1]} for f in files]
        })
    conn.close()
    print('inside get_all_dataset_ids, dataset_list: ', dataset_list)
    return {"datasets": dataset_list, "message": "Success", "code": "SUCCESS"}
from db.db_config import get_db
def get_dataset(page):
    is_empty = page.get("is_empty", True)
    dataset_id = page.get("dataset_id")  # Check if dataset ID is provided
    conn, cursor = get_db()
    cursor = conn.cursor()
    if is_empty and not dataset_id:
        # Create a new dataset ID if table is empty and no ID exists
        cursor.execute("INSERT INTO datasets DEFAULT VALUES")
        dataset_id = cursor.lastrowid
    elif not is_empty and dataset_id:
        # If table is not empty and dataset ID exists, return it
        pass  # No need to fetch anything, just return the given dataset_id
    else:
        # If no dataset ID was provided, return the latest one in the database
        cursor.execute("SELECT id FROM datasets ORDER BY id DESC LIMIT 1")
        result = cursor.fetchone()
        dataset_id = result[0] if result else None
    conn.commit()
    conn.close()
    result = {"dataset_id": dataset_id, "message": "Done", "code": "SUCCESS"}
    return result
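get_dataset branches on two JSON fields. The three request shapes its branches expect, inferred from the code (the concrete values are illustrative assumptions):

    page = {"is_empty": True}                    # inserts a new datasets row, returns its id
    page = {"is_empty": False, "dataset_id": 3}  # echoes the supplied id back unchanged
    page = {"is_empty": False}                   # falls through to the newest id in the table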
from db.db_config import get_db
def get_files(dataset_id):
    print('received page within get files: ', dataset_id)
    # Read JSON body
    if not dataset_id:
        return {"message": "Missing dataset_id", "code": "ERROR"}, 400  # Return error if missing
    conn, cursor = get_db()
    cursor.execute("SELECT filename, date FROM files WHERE dataset_id = ?", (dataset_id,))
    files = [{"title": row[0], "date": row[1]} for row in cursor.fetchall()]
    print(f"Retrieved files for dataset_id {dataset_id}: {files}")
    conn.close()
    if not files:
        print(f"No files found for dataset_id {dataset_id}")  # Log warning
        return {"files": [], "message": "No files found", "code": "EMPTY"}
    return {"files": files, "message": "Success", "code": "SUCCESS"}
......@@ -10,9 +10,12 @@ from db.db_config import get_db
# Perform NER on Text
def run_ner_on_text(page):
print('NER tag: ',page)
nertag = page.split('__')[0]
datasetid = page.split('__')[1].split('><p>')[0].replace('<div id=','').replace('"','').strip()
ner_driver = CkipNerChunker(model="bert-base")
conn, cursor = get_db()
cursor.execute('SELECT * from news;')
#cursor.execute('SELECT * from files;')
cursor.execute('SELECT * from files where dataset_id = "' + datasetid + '";')
res = cursor.fetchall()
data = []
for row in res:
......@@ -42,7 +45,7 @@ def run_ner_on_text(page):
word = tag.split('__')[0]
ner = tag.split('__')[1]
#translation = translate(word).text
if ner == page:
if ner == nertag:
ner_words_with_count.append({"0 Word": word, "1 NER": ner, "2 Frequency": freq})
seen_words.append(tag)
......
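run_ner_on_text now receives two values smuggled through one string: the NER tag and the dataset id, as 'TAG__<div id="3"><p>…'. A sketch that makes the implied wire format explicit (same parse as the chained replace() calls above; the function name is hypothetical):

    import re

    def parse_tag_and_dataset(page):
        # Assumes the client sends 'NERTAG__<div id="123"><p>...', as the code above implies.
        nertag, rest = page.split('__', 1)
        m = re.match(r'<div id="?(\d+)"?', rest)
        datasetid = m.group(1) if m else None
        return nertag, datasetid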
from ckip_transformers.nlp import CkipNerChunker
from db.db_config import get_db
import html_to_json
#page = '尼罗河 是一条流經非洲東部與北部的河流,與中非地區的剛果河、非洲南部的赞比西河以及西非地区的尼日尔河並列非洲最大的四個河流系統。'
# Perform NER on Text
def run_neroverall_on_text(page):
    print('neroverall page old: ',page)
    datasetid = page.split('><p>')[0].replace('<div id=','').replace('"','').strip()
    #output_json = html_to_json.convert_tables(page)
    print('neroverall page: ',page)
    ner_driver = CkipNerChunker(model="bert-base")
    conn, cursor = get_db()
    cursor.execute('SELECT * from news;')
    cursor.execute('SELECT * from files where dataset_id = "'+datasetid+'";')
    res = cursor.fetchall()
    data = []
    for row in res:
        docid = row[0]
        content = row[-1].replace('\n', ' ').replace('\t', ' ')
        print('docid: ',docid)
        print('content: ',content)
        data.append([docid, content])
    ner_with_count = []
@@ -47,10 +55,11 @@ def run_neroverall_on_text(page):
        seen_tags.append(n)
    nerall = sorted(ner_with_count, key=lambda x: x["2 Frequency"], reverse=True)
    result = {'output': nerall,'message': 'Done', 'code': 'SUCCESS'}
    return result
......
@@ -16,8 +16,9 @@ def zng(paragraph):
def run_sentiment_on_text(page):
    datasetid = page.split('><p>')[0].replace('<div id=', '').replace('"', '').strip()
    conn, cursor = get_db()
    cursor.execute('SELECT * from news;')
    cursor.execute('SELECT * from files where dataset_id = "'+datasetid+'";')
    res = cursor.fetchall()
    data = []
    sentiments = []
......
from db.db_config import get_db
def upload_file(page):
    #filename = page.get('filename')
    #last_modified = page.get('lastModified')
    content = page.get('content')
    dataset_id = page.get("dataset_id")
    file_info = page.get("file")
    conn, cursor = get_db()
    cursor.execute("INSERT INTO files (dataset_id, filename, date, content) VALUES (?, ?, ?, ?)",
                   (dataset_id, file_info["filename"], file_info["lastModified"], file_info["content"]))
    conn.commit()
    conn.close()
    result = {'output': content, 'message': 'Done', 'code': 'SUCCESS'}
    return result
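The request body upload_file expects, inferred from its .get() calls (field values are illustrative assumptions):

    # Illustrative values; keys inferred from upload_file's .get() calls.
    page = {
        "dataset_id": 3,
        "content": "<p>full text</p>",
        "file": {
            "filename": "article.xml",
            "lastModified": "2024-03-01",
            "content": "<p>full text</p>",
        },
    }

Note the top-level content is only echoed back in the response; the stored content comes from file["content"].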
@@ -7,6 +7,8 @@ from db.db_config import get_db
def run_usas_on_text(page):
    print('usasoverall page old: ', page)
    datasetid = page.split('><p>')[0].replace('<div id=', '').replace('"', '').strip()
    d = {}
    with open('/Users/tom/PycharmProjects/cognistance/func/usas/usas_overall.txt') as f:
        for line in f:
@@ -24,7 +26,8 @@ def run_usas_on_text(page):
    nlp.add_pipe('pymusas_rule_based_tagger', source=chinese_tagger_pipeline)
    conn, cursor = get_db()
    cursor.execute('SELECT * from news;')
    #cursor.execute('SELECT * from news;')
    cursor.execute('SELECT * from files where dataset_id = "'+datasetid+'";')
    res = cursor.fetchall()
    data = []
......
A general and abstract terms
B the body and the individual
C arts and crafts
E emotion
F food and farming
G government and public
H architecture, housing and the home
I money and commerce in industry
K entertainment, sports and games
L life and living things
M movement, location, travel and transport
N numbers and measurement
O substances, materials, objects and equipment
P education
Q language and communication
S social actions, states and processes
T Time
W world and environment
X psychological actions, states and processes
Y science and technology
Z names and grammar
\ No newline at end of file
@@ -8,6 +8,8 @@ from db.db_config import get_db
def run_usasFine_on_text(page):
    print('tag: ',page)
    usastag = page.split('__')[0]
    datasetid = page.split('__')[1].split('><p>')[0].replace('<div id=', '').replace('"', '').strip()
    d = {}
    with open('/Users/tom/PycharmProjects/cognistance/func/usas/usas_desc.txt') as f:
        for line in f:
@@ -25,11 +27,13 @@ def run_usasFine_on_text(page):
    nlp.add_pipe('pymusas_rule_based_tagger', source=chinese_tagger_pipeline)
    conn, cursor = get_db()
    cursor.execute('SELECT * from news;')
    #cursor.execute('SELECT * from files;')
    cursor.execute('SELECT * from files where dataset_id = "' + datasetid + '";')
    res = cursor.fetchall()
    data = []
    for row in res:
        print('row in usas fine: ',row)
        docid = row[0]
        content = row[-1].replace('\n', ' ').replace('\t', ' ')
        data.append([docid, content])
@@ -47,7 +51,7 @@ def run_usasFine_on_text(page):
    for token in output_doc:
        start, end = token._.pymusas_mwe_indexes[0]
        idx = (start, end)
        #idx = (start, end)
        for el in token._.pymusas_tags:
            el = el.split('.')[0]
@@ -66,7 +70,8 @@ def run_usasFine_on_text(page):
        word = tag.split('__')[0]
        usas = tag.split('__')[1]
        if page in usas:
        if usastag in usas:
            print('before tag object: ',word)
            tag_object = {"0 Word":word,"1 Discourse Field": usas, "2 Definition":d[usas],"3 Frequency": freq}
            usas_tags_with_count.append(tag_object)
......
A1 GENERAL AND ABSTRACT TERMS
A1.1.1 General actions, making etc.
A1.1.2 Damaging and destroying
A1.2 Suitability
A1.3 Caution
A1.4 Chance, luck
A1.5 Use
A1.5.1 Using
A1.5.2 Usefulness
A1.6 Physical/mental
A1.7 Constraint
A1.8 Inclusion/Exclusion
A1.9 Avoiding
A2 Affect
A2.1 Affect:- Modify, change
A2.2 Affect:- Cause/Connected
A3 Being
A4 Classification
A4.1 Generally kinds, groups, examples
A4.2 Particular/general; detail
A5 Evaluation
A5.1 Evaluation:- Good/bad
A5.2 Evaluation:- True/false
A5.3 Evaluation:- Accuracy
A5.4 Evaluation:- Authenticity
A6 Comparing
A6.1 Comparing:- Similar/different
A6.2 Comparing:- Usual/unusual
A6.3 Comparing:- Variety
A7 Definite (+ modals)
A8 Seem
A9 Getting and giving; possession
A10 Open/closed; Hiding/Hidden; Finding; Showing
A11 Importance
A11.1 Importance: Important
A11.2 Importance: Noticeability
A12 Easy/difficult
A13 Degree
A13.1 Degree: Non-specific
A13.2 Degree: Maximizers
A13.3 Degree: Boosters
A13.4 Degree: Approximators
A13.5 Degree: Compromisers
A13.6 Degree: Diminishers
A13.7 Degree: Minimizers
A14 Exclusivizers/particularizers
A15 Safety/Danger
B1 Anatomy and physiology
B2 Health and disease
B3 medicines and medical treatment
B4 Cleaning and personal care
B5 Clothes and personal belongings
C1 Arts and crafts
E1 EMOTIONAL ACTIONS, STATES AND PROCESSES General
E2 Liking
E3 Calm/Violent/Angry
E4 Happy/sad
E4.1 Happy/sad: Happy
E4.2 Happy/sad: Contentment
E5 Fear/bravery/shock
E6 Worry, concern, confident
F1 Food
F2 Drinks
F3 Cigarettes and drugs
F4 Farming & Horticulture
G1 Government, Politics and elections
G1.1 Government etc.
G1.2 Politics
G2 Crime, law and order
G2.1 Crime, law and order: Law and order
G2.2 General ethics
G3 Warfare, defence and the army; weapons
H1 Architecture and kinds of houses and buildings
H2 Parts of buildings
H3 Areas around or near houses
H4 Residence
H5 Furniture and household fittings
I1 Money generally
I1.1 Money: Affluence
I1.2 Money: Debts
I1.3 Money: Price
I2 Business
I2.1 Business: Generally
I2.2 Business: Selling
I3 Work and employment
I3.1 Work and employment: Generally
I3.2 Work and employment: Professionalism
I4 Industry
K1 Entertainment generally
K2 Music and related activities
K3 Recorded sound etc.
K4 Drama, the theatre and showbusiness
K5 Sports and games generally
K5.1 Sports
K5.2 Games
K6 Children's games and toys
L1 Life and living things
L2 Living creatures generally
L3 Plants
M1 Moving, coming and going
M2 Putting, taking, pulling, pushing, transporting &c.
M3 Vehicles and transport on land
M4 Shipping, swimming etc.
M5 Aircraft and flying
M6 Location and direction
M7 Places
M8 Remaining/stationary
N1 Numbers
N2 Mathematics
N3 Measurement
N3.1 Measurement: General
N3.2 Measurement: Size
N3.3 Measurement: Distance
N3.4 Measurement: Volume
N3.5 Measurement: Weight
N3.6 Measurement: Area
N3.7 Measurement: Length & height
N3.8 Measurement: Speed
N4 Linear order
N5 Quantities
N5.1 Entirety; maximum
N5.2 Exceeding; waste
N6 Frequency etc.
O1 Substances and materials generally
O1.1 Substances and materials generally: Solid
O1.2 Substances and materials generally: Liquid
O1.3 Substances and materials generally: Gas
O2 Objects generally
O3 Electricity and electrical equipment
O4 Physical attributes
O4.1 General appearance and physical properties
O4.2 Judgement of appearance (pretty etc.)
O4.3 Colour and colour patterns
O4.4 Shape
O4.5 Texture
O4.6 Temperature
P1 Education in general
Q1 LINGUISTIC ACTIONS, STATES AND PROCESSES; COMMUNICATION
Q1.1 LINGUISTIC ACTIONS, STATES AND PROCESSES; COMMUNICATION
Q1.2 Paper documents and writing
Q1.3 Telecommunications
Q2 Speech acts
Q2.1 Speech etc:- Communicative
Q2.2 Speech acts
Q3 Language, speech and grammar
Q4 The Media
Q4.1 The Media:- Books
Q4.2 The Media:- Newspapers etc.
Q4.3 The Media:- TV, Radio and Cinema
S1 SOCIAL ACTIONS, STATES AND PROCESSES
S1.1 SOCIAL ACTIONS, STATES AND PROCESSES
S1.1.1 SOCIAL ACTIONS, STATES AND PROCESSES
S1.1.2 Reciprocity
S1.1.3 Participation
S1.1.4 Deserve etc.
S1.2 Personality traits
S1.2.1 Approachability and Friendliness
S1.2.2 Avarice
S1.2.3 Egoism
S1.2.4 Politeness
S1.2.5 Toughness; strong/weak
S1.2.6 Sensible
S2 People
S2.1 People:- Female
S2.2 People:- Male
S3 Relationship
S3.1 Relationship: General
S3.2 Relationship: Intimate/sexual
S4 Kin
S5 Groups and affiliation
S6 Obligation and necessity
S7 Power relationship
S7.1 Power, organizing
S7.2 Respect
S7.3 Competition
S7.4 Permission
S8 Helping/hindering
S9 Religion and the supernatural
T1 Time
T1.1 Time: General
T1.1.1 Time: General: Past
T1.1.2 Time: General: Present; simultaneous
T1.1.3 Time: General: Future
T1.2 Time: Momentary
T1.3 Time: Period
T2 Time: Beginning and ending
T3 Time: Old, new and young; age
T4 Time: Early/late
W1 The universe
W2 Light
W3 Geographical terms
W4 Weather
W5 Green issues
X1 PSYCHOLOGICAL ACTIONS, STATES AND PROCESSES
X2 Mental actions and processes
X2.1 Thought, belief
X2.2 Knowledge
X2.3 Learn
X2.4 Investigate, examine, test, search
X2.5 Understand
X2.6 Expect
X3 Sensory
X3.1 Sensory:- Taste
X3.2 Sensory:- Sound
X3.3 Sensory:- Touch
X3.4 Sensory:- Sight
X3.5 Sensory:- Smell
X4 Mental object
X4.1 Mental object:- Conceptual object
X4.2 Mental object:- Means, method
X5 Attention
X5.1 Attention
X5.2 Interest/boredom/excited/energetic
X6 Deciding
X7 Wanting; planning; choosing
X8 Trying
X9 Ability
X9.1 Ability:- Ability, intelligence
X9.2 Ability:- Success and failure
Y1 Science and technology in general
Y2 Information technology and computing
Z0 Unmatched proper noun
Z1 Personal names
Z2 Geographical names
Z3 Other proper names
Z4 Discourse Bin
Z5 Grammatical bin
Z6 Negative
Z7 If
Z8 Pronouns etc.
Z9 Trash can
Z99 Unmatched
\ No newline at end of file
@@ -96,4 +96,31 @@ def neroverall():
    page = request_data['page']
    result = run_neroverall_all(page)
    return result
@app.route("/upload", methods=['POST'])
def upload():
    request_data = request.get_json()
    page = request_data['page']
    result = run_upload_all(page)
    return result
@app.route("/getdataset", methods=['POST'])
def getdataset():
    request_data = request.get_json()
    page = request_data['page']
    result = get_dataset_all(page)
    return result
@app.route("/getfiles", methods=['POST'])
def getfiles():
    request_data = request.get_json()
    page = request_data['dataset_id']
    result = get_files_all(page)
    return result
\ No newline at end of file
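A sketch of exercising the three new routes end to end with requests (the host and port assume Flask's default dev server, which is not pinned anywhere in this diff):

    import requests

    base = "http://localhost:5000"  # host/port assumed (Flask dev default)
    # Create (or fetch) a dataset id.
    new_id = requests.post(f"{base}/getdataset",
                           json={"page": {"is_empty": True}}).json()["dataset_id"]
    # Attach a file to it.
    requests.post(f"{base}/upload", json={"page": {
        "dataset_id": new_id,
        "content": "<p>text</p>",
        "file": {"filename": "a.xml", "lastModified": "2024-03-01", "content": "<p>text</p>"},
    }})
    # List the files back.
    files = requests.post(f"{base}/getfiles", json={"dataset_id": new_id}).json()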