diff --git a/README.md b/README.md
index 01bca00a84dfad6fb07ffe6f11447bf5bbbd5c5a..bfc7da7444dccf795f56217129bc84b9c5d1c0f6 100644
--- a/README.md
+++ b/README.md
@@ -5,4 +5,5 @@ A simple flask server designed to house python data analysis scripts to be harne
 The current iteration of this tool focuses on the following types of analysis.
 
 - NER
-- USAS
\ No newline at end of file
+- USAS
+- Sentiment Analysis
\ No newline at end of file
diff --git a/api/api_functions.py b/api/api_functions.py
index 7f65efb7e9b17bc186e07a11d65fbf407320aba9..44f42eb504eb55239b2e957c53eced978949522e 100644
--- a/api/api_functions.py
+++ b/api/api_functions.py
@@ -1,6 +1,7 @@
 from flask import make_response, jsonify
 
 from func.ner.ner import *
+from func.sentiment.sentiment import *
 from func.usas.usas import *
 
 
@@ -18,7 +19,7 @@ def get_ner_for_data(page):
 
 # Perform USAS analysis on a file
 # TAKES XML text page
-# Returns NER results
+# Returns USAS results
 def get_usas_for_data(page):
     result = run_usas_on_text(page)
 
@@ -26,3 +27,15 @@
         return make_response(jsonify(result), 201)
 
     return make_response(jsonify(result), 400)
+
+
+# Perform Sentiment analysis on a file
+# TAKES XML text page
+# Returns Sentiment results
+def get_sentiment_for_data(page):
+    result = run_sentiment_on_text(page)
+
+    if result["code"] == "SUCCESS":
+        return make_response(jsonify(result), 201)
+
+    return make_response(jsonify(result), 400)
\ No newline at end of file
diff --git a/func/ner/ner.py b/func/ner/ner.py
index b7a4e512312797be0c41ab82e35455542d914e56..a9d8977b95d5de68e8cb38e9602e64eaab55c015 100644
--- a/func/ner/ner.py
+++ b/func/ner/ner.py
@@ -1,27 +1,35 @@
 import torch
 from ckip_transformers.nlp import CkipWordSegmenter, CkipPosTagger, CkipNerChunker
 from transformers import pipeline
+from shared.translate import translate
+
+#page = '尼罗河 是一条流經非洲東部與北部的河流,與中非地區的剛果河、非洲南部的赞比西河以及西非地区的尼日尔河並列非洲最大的四個河流系統。'
 
 
 # Perform NER on Text
 def run_ner_on_text(page):
     ner_driver = CkipNerChunker(model="bert-base")
-    data = []
+    ner = ner_driver([page])
+    tags = []
     for item in ner[0]:
-        word = item.word
-        ner = item.ner
-        idx = item.idx
-        print(type(idx))
-        print(idx)
-        print(type(ner))
-        print(ner)
-        print(type(word))
-        print(word)
-        print('--------')
-        obj = {"word": word, "translation": "", "ner": ner, "idx": idx}
-
-        data.append(obj)
+        # Pair each word with its entity type so duplicate mentions
+        # collapse to a single key below.
+        tags.append(item.word + '__' + item.ner)
+
+    ner_words_with_count = []
+    seen_words = []
+    for tag in tags:
+        if tag not in seen_words:
+            seen_words.append(tag)
+            # Scaled raw count; the constant denominator only affects
+            # display, not the sort order.
+            freq = tags.count(tag) / 1000000
+            word, ner_tag = tag.split('__', 1)
+            translation = translate(word).text
+            ner_words_with_count.append({"0 Word": word, "1 Translation": translation, "2 NER": ner_tag, "3 Frequency": freq})
+
+    data = sorted(ner_words_with_count, key=lambda x: x["3 Frequency"], reverse=True)
 
     result = {'output': data,'message': 'Done', 'code': 'SUCCESS'}
 
     return result
diff --git a/func/sentiment/sentiment.py b/func/sentiment/sentiment.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7c5076365c9208c028653fde23458a0e610a72c
--- /dev/null
+++ b/func/sentiment/sentiment.py
@@ -0,0 +1,34 @@
+from transformers import (
+    BertTokenizerFast,
+    AutoModelForSequenceClassification,
+    pipeline,
+)
+import re
+
+
+# Split a paragraph into sentences on common full-width and half-width
+# sentence terminators, keeping the trailing punctuation.
+def zng(paragraph):
+    for sent in re.findall(u'[^!?。.!?]+[!?。.!?]?', paragraph, flags=re.U):
+        yield sent
+
+
+#page = '尼罗河 是一条流經非洲東部與北部的河流,與中非地區的剛果河、非洲南部的赞比西河以及西非地区的尼日尔河並列非洲最大的四個河流系統。尼罗河 是一条流經非洲東部與北部的河流,與中非地區的剛果河、非洲南部的赞比西河以及西非地区的尼日尔河並列非洲最大的四個河流系統。'
+def run_sentiment_on_text(page):
+    sentences = list(zng(page))
+
+    # bardsai/finance-sentiment-zh-base is a Chinese sentiment classifier.
+    tokenizer = BertTokenizerFast.from_pretrained('bardsai/finance-sentiment-zh-base')
+    model = AutoModelForSequenceClassification.from_pretrained('bardsai/finance-sentiment-zh-base')
+    nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
+
+    # Tally the predicted label for each sentence.
+    counts = dict()
+    for sent in sentences:
+        label = nlp(sent)[0]['label']
+        counts[label] = counts.get(label, 0) + 1
+
+    sentiments = [{"Sentiment": k, "Count": v} for k, v in counts.items()]
+
+    result = {'output': sentiments, 'message': 'Done', 'code': 'SUCCESS'}
+    return result
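For a quick local check of the new module, the function can be called directly from the repo root; a minimal sketch (the two-sentence input is made up for illustration, and the label strings come from the bardsai model, so the exact output may differ):

```python
from func.sentiment.sentiment import run_sentiment_on_text

# Two short sentences; run_sentiment_on_text splits them with zng(),
# classifies each one, and tallies the labels.
result = run_sentiment_on_text('股价大涨。股价暴跌。')
print(result)
# Expected shape:
# {'output': [{'Sentiment': 'positive', 'Count': 1},
#             {'Sentiment': 'negative', 'Count': 1}],
#  'message': 'Done', 'code': 'SUCCESS'}
```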
diff --git a/func/usas/usas.py b/func/usas/usas.py
index 455792f76d682cb0aaa2e8864e81f8f434ea0ed6..3a73dc60b65c18f781909c54617eb4fe05a45358 100644
--- a/func/usas/usas.py
+++ b/func/usas/usas.py
@@ -2,7 +2,20 @@ import spacy
+import os
 
 
 # Perform USAS on Text
+#page = '尼罗河 是一条流經非洲東部與北部的河流,與中非地區的剛果河、非洲南部的赞比西河以及西非地区的尼日尔河並列非洲最大的四個河流系統。'
+
+
 def run_usas_on_text(page):
+    # Map each USAS tag to its description, loaded from the file that
+    # ships alongside this module (avoids a machine-specific path).
+    d = {}
+    desc_path = os.path.join(os.path.dirname(__file__), 'usas_desc.txt')
+    with open(desc_path) as f:
+        for line in f:
+            key, val = line.rstrip('\n').split(' ', 1)
+            d[key.strip()] = val.strip()
+
     # We exclude the following components as we do not need them.
     nlp = spacy.load('zh_core_web_sm', exclude=['parser', 'ner'])
     # Load the Chinese PyMUSAS rule-based tagger in a separate spaCy pipeline
@@ -11,24 +24,30 @@ def run_usas_on_text(page):
     nlp.add_pipe('pymusas_rule_based_tagger', source=chinese_tagger_pipeline)
 
     output_doc = nlp(page)
-    data = []
     tags = []
-    print(f'Text\tPOS\tMWE start and end index\tUSAS Tags')
     for token in output_doc:
         start, end = token._.pymusas_mwe_indexes[0]
         idx = (start, end)
         for el in token._.pymusas_tags:
-            obj = {"word": token.text, "Usas Tags": el, "idx": idx}
             tags.append(el)
-            data.append(obj)
 
-    res = []
-    procTags = []
-    for x in tags:
-        if x not in procTags:
-            res.append({"Tag": x, "Count": tags.count(x)})
-
-            procTags.append(x)
-
-    result = {'output': res, 'message': 'Done', 'code': 'SUCCESS'}
+    # Count each distinct tag once, attach its description, and sort by
+    # frequency, most common first.
+    usas_tags_with_count = []
+    seen_tags = []
+    for tag in tags:
+        if tag not in seen_tags:
+            seen_tags.append(tag)
+            try:
+                definition = d[tag]
+            except KeyError:
+                # Skip tags with no entry in usas_desc.txt.
+                continue
+            # Scaled raw count; the constant denominator only affects
+            # display, not the sort order.
+            freq = tags.count(tag) / 1000000
+            usas_tags_with_count.append({"0 Tag": tag, "1 Definition": definition, "2 Frequency": freq})
+
+    usas_tags_with_count = sorted(usas_tags_with_count, key=lambda x: x["2 Frequency"], reverse=True)
+    result = {'output': usas_tags_with_count, 'message': 'Done', 'code': 'SUCCESS'}
 
     return result
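The loader in run_usas_on_text assumes each line of usas_desc.txt (added below) is a tag, a single space, then its description. A minimal sketch of the lookup it builds, using two entries copied from the file:

```python
# Same parsing as the loop in run_usas_on_text: split on the first space,
# keyed by the tag.
d = {}
for line in ['A1 GENERAL AND ABSTRACT TERMS', 'W3 Geographical terms']:
    key, val = line.split(' ', 1)
    d[key] = val

print(d['W3'])  # -> Geographical terms
```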
diff --git a/func/usas/usas_desc.txt b/func/usas/usas_desc.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2942cfeb026ed68c920f94d81c73fedc135a616f
--- /dev/null
+++ b/func/usas/usas_desc.txt
@@ -0,0 +1,232 @@
+A1 GENERAL AND ABSTRACT TERMS
+A1.1.1 General actions, making etc.
+A1.1.2 Damaging and destroying
+A1.2 Suitability
+A1.3 Caution
+A1.4 Chance, luck
+A1.5 Use
+A1.5.1 Using
+A1.5.2 Usefulness
+A1.6 Physical/mental
+A1.7 Constraint
+A1.8 Inclusion/Exclusion
+A1.9 Avoiding
+A2 Affect
+A2.1 Affect:- Modify, change
+A2.2 Affect:- Cause/Connected
+A3 Being
+A4 Classification
+A4.1 Generally kinds, groups, examples
+A4.2 Particular/general; detail
+A5 Evaluation
+A5.1 Evaluation:- Good/bad
+A5.2 Evaluation:- True/false
+A5.3 Evaluation:- Accuracy
+A5.4 Evaluation:- Authenticity
+A6 Comparing
+A6.1 Comparing:- Similar/different
+A6.2 Comparing:- Usual/unusual
+A6.3 Comparing:- Variety
+A7 Definite (+ modals)
+A8 Seem
+A9 Getting and giving; possession
+A10 Open/closed; Hiding/Hidden; Finding; Showing
+A11 Importance
+A11.1 Importance: Important
+A11.2 Importance: Noticeability
+A12 Easy/difficult
+A13 Degree
+A13.1 Degree: Non-specific
+A13.2 Degree: Maximizers
+A13.3 Degree: Boosters
+A13.4 Degree: Approximators
+A13.5 Degree: Compromisers
+A13.6 Degree: Diminishers
+A13.7 Degree: Minimizers
+A14 Exclusivizers/particularizers
+A15 Safety/Danger
+B1 Anatomy and physiology
+B2 Health and disease
+B3 Medicines and medical treatment
+B4 Cleaning and personal care
+B5 Clothes and personal belongings
+C1 Arts and crafts
+E1 EMOTIONAL ACTIONS, STATES AND PROCESSES General
+E2 Liking
+E3 Calm/Violent/Angry
+E4 Happy/sad
+E4.1 Happy/sad: Happy
+E4.2 Happy/sad: Contentment
+E5 Fear/bravery/shock
+E6 Worry, concern, confident
+F1 Food
+F2 Drinks
+F3 Cigarettes and drugs
+F4 Farming & Horticulture
+G1 Government, Politics and elections
+G1.1 Government etc.
+G1.2 Politics
+G2 Crime, law and order
+G2.1 Crime, law and order: Law and order
+G2.2 General ethics
+G3 Warfare, defence and the army; weapons
+H1 Architecture and kinds of houses and buildings
+H2 Parts of buildings
+H3 Areas around or near houses
+H4 Residence
+H5 Furniture and household fittings
+I1 Money generally
+I1.1 Money: Affluence
+I1.2 Money: Debts
+I1.3 Money: Price
+I2 Business
+I2.1 Business: Generally
+I2.2 Business: Selling
+I3 Work and employment
+I3.1 Work and employment: Generally
+I3.2 Work and employment: Professionalism
+I4 Industry
+K1 Entertainment generally
+K2 Music and related activities
+K3 Recorded sound etc.
+K4 Drama, the theatre and showbusiness
+K5 Sports and games generally
+K5.1 Sports
+K5.2 Games
+K6 Children's games and toys
+L1 Life and living things
+L2 Living creatures generally
+L3 Plants
+M1 Moving, coming and going
+M2 Putting, taking, pulling, pushing, transporting &c.
+M3 Vehicles and transport on land
+M4 Shipping, swimming etc.
+M5 Aircraft and flying
+M6 Location and direction
+M7 Places
+M8 Remaining/stationary
+N1 Numbers
+N2 Mathematics
+N3 Measurement
+N3.1 Measurement: General
+N3.2 Measurement: Size
+N3.3 Measurement: Distance
+N3.4 Measurement: Volume
+N3.5 Measurement: Weight
+N3.6 Measurement: Area
+N3.7 Measurement: Length & height
+N3.8 Measurement: Speed
+N4 Linear order
+N5 Quantities
+N5.1 Entirety; maximum
+N5.2 Exceeding; waste
+N6 Frequency etc.
+O1 Substances and materials generally
+O1.1 Substances and materials generally: Solid
+O1.2 Substances and materials generally: Liquid
+O1.3 Substances and materials generally: Gas
+O2 Objects generally
+O3 Electricity and electrical equipment
+O4 Physical attributes
+O4.1 General appearance and physical properties
+O4.2 Judgement of appearance (pretty etc.)
+O4.3 Colour and colour patterns
+O4.4 Shape
+O4.5 Texture
+O4.6 Temperature
+P1 Education in general
+Q1 LINGUISTIC ACTIONS, STATES AND PROCESSES; COMMUNICATION
+Q1.1 LINGUISTIC ACTIONS, STATES AND PROCESSES; COMMUNICATION
+Q1.2 Paper documents and writing
+Q1.3 Telecommunications
+Q2 Speech acts
+Q2.1 Speech etc:- Communicative
+Q2.2 Speech acts
+Q3 Language, speech and grammar
+Q4 The Media
+Q4.1 The Media:- Books
+Q4.2 The Media:- Newspapers etc.
+Q4.3 The Media:- TV, Radio and Cinema
+S1 SOCIAL ACTIONS, STATES AND PROCESSES
+S1.1 SOCIAL ACTIONS, STATES AND PROCESSES
+S1.1.1 SOCIAL ACTIONS, STATES AND PROCESSES
+S1.1.2 Reciprocity
+S1.1.3 Participation
+S1.1.4 Deserve etc.
+S1.2 Personality traits
+S1.2.1 Approachability and Friendliness
+S1.2.2 Avarice
+S1.2.3 Egoism
+S1.2.4 Politeness
+S1.2.5 Toughness; strong/weak
+S1.2.6 Sensible
+S2 People
+S2.1 People:- Female
+S2.2 People:- Male
+S3 Relationship
+S3.1 Relationship: General
+S3.2 Relationship: Intimate/sexual
+S4 Kin
+S5 Groups and affiliation
+S6 Obligation and necessity
+S7 Power relationship
+S7.1 Power, organizing
+S7.2 Respect
+S7.3 Competition
+S7.4 Permission
+S8 Helping/hindering
+S9 Religion and the supernatural
+T1 Time
+T1.1 Time: General
+T1.1.1 Time: General: Past
+T1.1.2 Time: General: Present; simultaneous
+T1.1.3 Time: General: Future
+T1.2 Time: Momentary
+T1.3 Time: Period
+T2 Time: Beginning and ending
+T3 Time: Old, new and young; age
+T4 Time: Early/late
+W1 The universe
+W2 Light
+W3 Geographical terms
+W4 Weather
+W5 Green issues
+X1 PSYCHOLOGICAL ACTIONS, STATES AND PROCESSES
+X2 Mental actions and processes
+X2.1 Thought, belief
+X2.2 Knowledge
+X2.3 Learn
+X2.4 Investigate, examine, test, search
+X2.5 Understand
+X2.6 Expect
+X3 Sensory
+X3.1 Sensory:- Taste
+X3.2 Sensory:- Sound
+X3.3 Sensory:- Touch
+X3.4 Sensory:- Sight
+X3.5 Sensory:- Smell
+X4 Mental object
+X4.1 Mental object:- Conceptual object
+X4.2 Mental object:- Means, method
+X5 Attention
+X5.1 Attention
+X5.2 Interest/boredom/excited/energetic
+X6 Deciding
+X7 Wanting; planning; choosing
+X8 Trying
+X9 Ability
+X9.1 Ability:- Ability, intelligence
+X9.2 Ability:- Success and failure
+Y1 Science and technology in general
+Y2 Information technology and computing
+Z0 Unmatched proper noun
+Z1 Personal names
+Z2 Geographical names
+Z3 Other proper names
+Z4 Discourse Bin
+Z5 Grammatical bin
+Z6 Negative
+Z7 If
+Z8 Pronouns etc.
+Z9 Trash can
+Z99 Unmatched
\ No newline at end of file
diff --git a/main.py b/main.py
index 34bb595a783066c897c3afee751d1dc5fa6510b7..d6027af832e34d77c80d471ad823a7464a1f770c 100644
--- a/main.py
+++ b/main.py
@@ -38,3 +38,13 @@ def usas():
     result = get_usas_for_data(page)
 
     return result
+
+
+@app.route("/sentiment", methods=['POST'])
+def sentiment():
+
+    request_data = request.get_json()
+    print(request_data)
+    page = request_data['page']
+    result = get_sentiment_for_data(page)
+
+    return result
diff --git a/requirements.txt b/requirements.txt
index 228f9518e76fb535accc47338f6385cc08790d71..11e587fe5b159cc43c58b9b91f6a5238d3eaeff6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,3 +5,4 @@ ckip-transformers~=0.3.4
 flask-cors~=4.0.1
 spacy~=3.7.4
 googletrans ~=3.1.0a0
+pandas ~=2.2.3
\ No newline at end of file
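Once the server is running, the new route can be exercised like the existing ones; a sketch assuming Flask's default host/port and the third-party requests package (not listed in requirements.txt):

```python
import requests

# The route reads the 'page' key from the JSON body and returns 201 with
# the analysis on success, 400 otherwise (see get_sentiment_for_data).
resp = requests.post('http://127.0.0.1:5000/sentiment',
                     json={'page': '今天天气很好。你觉得呢?'})
print(resp.status_code)
print(resp.json())
```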
diff --git a/shared/translate.py b/shared/translate.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c993a10052f608854fb410b31df248266e2d65c
--- /dev/null
+++ b/shared/translate.py
@@ -0,0 +1,11 @@
+from googletrans import Translator
+
+
+# Wrapper for the googletrans library. Takes a Chinese string and returns
+# the googletrans result object (use .text for the English translation).
+def translate(word):
+    translator = Translator()
+
+    result = translator.translate(word, src='zh-cn', dest='en')
+
+    return result
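Usage note: googletrans returns a Translated object rather than a plain string, which is why ner.py reads .text off the result. A quick check, assuming network access (googletrans calls Google's web endpoint):

```python
from shared.translate import translate

result = translate('河流')
print(result.text)  # English translation, e.g. 'rivers'
```

Since translate() constructs a new Translator on every call, NER requests with many distinct words pay that setup cost repeatedly; reusing a single Translator instance is an easy optimization if it becomes a bottleneck.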