Skip to content
Snippets Groups Projects
Commit 85589304 authored by Thomas Edwards's avatar Thomas Edwards
Browse files

Work towards USAS + ner

parent 0c57597d
No related branches found
No related tags found
No related merge requests found
...@@ -5,4 +5,5 @@ A simple flask server designed to house python data analysis scripts to be harne ...@@ -5,4 +5,5 @@ A simple flask server designed to house python data analysis scripts to be harne
The current iteration of this tool focuses on the following types of analysis. The current iteration of this tool focuses on the following types of analysis.
- NER - NER
- USAS - USAS
\ No newline at end of file - Sentiment Analysis
\ No newline at end of file
from flask import make_response, jsonify from flask import make_response, jsonify
from func.ner.ner import * from func.ner.ner import *
from func.sentiment.sentiment import *
from func.usas.usas import * from func.usas.usas import *
...@@ -18,7 +19,7 @@ def get_ner_for_data(page): ...@@ -18,7 +19,7 @@ def get_ner_for_data(page):
# Perform USAS analysis on a file # Perform USAS analysis on a file
# TAKES XML text page # TAKES XML text page
# Returns NER results # Returns USAS results
def get_usas_for_data(page): def get_usas_for_data(page):
result = run_usas_on_text(page) result = run_usas_on_text(page)
...@@ -26,3 +27,15 @@ def get_usas_for_data(page): ...@@ -26,3 +27,15 @@ def get_usas_for_data(page):
return make_response(jsonify(result), 201) return make_response(jsonify(result), 201)
return make_response(jsonify(result), 400) return make_response(jsonify(result), 400)
def get_sentiment_for_data(page):
    """Run sentiment analysis on ``page`` and wrap the result in a Flask response.

    Args:
        page: The text to analyse; it is passed straight through to
            ``run_sentiment_on_text``.

    Returns:
        A Flask response whose body is the JSON-serialised result dict.
        The status is 201 when the result's ``"code"`` entry equals
        ``"SUCCESS"`` and 400 for any other value.
    """
    result = run_sentiment_on_text(page)
    # Only the status code differs between the success and failure paths.
    status = 201 if result["code"] == "SUCCESS" else 400
    return make_response(jsonify(result), status)
\ No newline at end of file
import torch import torch
from ckip_transformers.nlp import CkipWordSegmenter, CkipPosTagger, CkipNerChunker from ckip_transformers.nlp import CkipWordSegmenter, CkipPosTagger, CkipNerChunker
from transformers import pipeline from transformers import pipeline
import pandas as pd
from shared.translate import translate
#page = '尼罗河 是一条流經非洲東部與北部的河流,與中非地區的剛果河、非洲南部的赞比西河以及西非地区的尼日尔河並列非洲最大的四個河流系統。'
# Perform NER on Text # Perform NER on Text
def run_ner_on_text(page): def run_ner_on_text(page):
ner_driver = CkipNerChunker(model="bert-base") ner_driver = CkipNerChunker(model="bert-base")
data = []
ner = ner_driver([page]) ner = ner_driver([page])
tags = []
for item in ner[0]: for item in ner[0]:
word = item.word word = item.word
ner = item.ner ner = item.ner
idx = item.idx #idx = item.idx
print(type(idx))
print(idx) tags.append(word+'__'+ner)
print(type(ner))
print(ner)
print(type(word)) ner_words_with_count = []
print(word) seen_words = []
print('--------') for tag in tags:
obj = {"word": word, "translation": "", "ner": ner, "idx": idx} if tag not in seen_words:
data.append(obj) freq = tags.count(tag) / 1000000
word = tag.split('__')[0]
ner = tag.split('__')[1]
translation = translate(word).text
ner_words_with_count.append({"0 Word": word, "1 Translation":translation, "2 NER": ner, "3 Frequency": freq})
seen_words.append(tag)
data = sorted(ner_words_with_count, key=lambda x: x["3 Frequency"], reverse=True)
result = {'output': data,'message': 'Done', 'code': 'SUCCESS'} result = {'output': data,'message': 'Done', 'code': 'SUCCESS'}
return result return result
from textblob.en import sentiment
from transformers import (
BertTokenizerFast,
AutoModelForMaskedLM,
AutoModelForCausalLM,
AutoModelForSequenceClassification,
)
from transformers import pipeline
import re
# Characters that end a sentence: ASCII '!', '?', '.', the ideographic full
# stop (U+3002) and the full-width '!' (U+FF01) and '?' (U+FF1F) used in
# Chinese text. Written as escapes so the pattern survives re-encoding.
_SENTENCE_END = r'!?.\u3002\uFF01\uFF1F'

# One sentence = a run of non-terminators plus at most one trailing terminator.
# Compiled once at import time instead of on every call.
_SENTENCE_PATTERN = re.compile(f'[^{_SENTENCE_END}]+[{_SENTENCE_END}]?')


def zng(paragraph):
    """Yield the sentences of ``paragraph`` one at a time.

    A sentence is a maximal run of characters that are not sentence
    terminators, followed by the terminator that closes it when one is
    present. Text after the last terminator is yielded as a final,
    unterminated sentence. Runs consisting only of terminators produce
    nothing.

    Args:
        paragraph: The text to split.

    Yields:
        Each sentence as a ``str``, in order of appearance, with its
        trailing terminator (if any) and any surrounding whitespace intact.
    """
    yield from _SENTENCE_PATTERN.findall(paragraph)
#page = '尼罗河 是一条流經非洲東部與北部的河流,與中非地區的剛果河、非洲南部的赞比西河以及西非地区的尼日尔河並列非洲最大的四個河流系統。尼罗河 是一条流經非洲東部與北部的河流,與中非地區的剛果河、非洲南部的赞比西河以及西非地区的尼日尔河並列非洲最大的四個河流系統。'
_SENTIMENT_MODEL_NAME = 'bardsai/finance-sentiment-zh-base'

# Lazily-built pipeline shared by all calls; loading the tokenizer and model
# is far too slow to repeat on every request.
_sentiment_pipeline = None


def _get_sentiment_pipeline():
    """Return the shared sentiment pipeline, building it on first use."""
    global _sentiment_pipeline
    if _sentiment_pipeline is None:
        tokenizer = BertTokenizerFast.from_pretrained(_SENTIMENT_MODEL_NAME)
        model = AutoModelForSequenceClassification.from_pretrained(_SENTIMENT_MODEL_NAME)
        _sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
    return _sentiment_pipeline


def run_sentiment_on_text(page):
    """Count how many sentences of ``page`` receive each sentiment label.

    The text is split into sentences with ``zng``; every non-blank sentence
    is classified on its own and the predicted labels are tallied.

    Args:
        page: The text to analyse.

    Returns:
        A dict with three entries: ``'output'`` is a list of
        ``{"Sentiment": label, "Count": n}`` dicts in order of first
        appearance, ``'message'`` is ``'Done'`` and ``'code'`` is
        ``'SUCCESS'``. ``'output'`` is empty when ``page`` contains no
        non-blank sentence.
    """
    from collections import Counter

    # Whitespace-only fragments (e.g. a newline between two sentences) carry
    # no sentiment and would only skew the tallies.
    sentences = [sentence for sentence in zng(page) if sentence.strip()]

    counts = Counter()
    if sentences:
        # Only touch the model when there is something to classify.
        classifier = _get_sentiment_pipeline()
        counts.update(classifier(sentence)[0]['label'] for sentence in sentences)

    sentiments = [{"Sentiment": label, "Count": count} for label, count in counts.items()]
    return {'output': sentiments, 'message': 'Done', 'code': 'SUCCESS'}
...@@ -2,7 +2,18 @@ import spacy ...@@ -2,7 +2,18 @@ import spacy
# Perform USAS on Text # Perform USAS on Text
#page = '尼罗河 是一条流經非洲東部與北部的河流,與中非地區的剛果河、非洲南部的赞比西河以及西非地区的尼日尔河並列非洲最大的四個河流系統。'
def run_usas_on_text(page): def run_usas_on_text(page):
d = {}
with open('/Users/tom/PycharmProjects/cognistance/func/usas/usas_desc.txt') as f:
for line in f:
lineL = line.replace('\n', '').split(' ', 1)
key = lineL[0].strip()
val = lineL[1].strip()
d[key] = val
# We exclude the following components as we do not need them. # We exclude the following components as we do not need them.
nlp = spacy.load('zh_core_web_sm', exclude=['parser', 'ner']) nlp = spacy.load('zh_core_web_sm', exclude=['parser', 'ner'])
# Load the Chinese PyMUSAS rule-based tagger in a separate spaCy pipeline # Load the Chinese PyMUSAS rule-based tagger in a separate spaCy pipeline
...@@ -11,27 +22,32 @@ def run_usas_on_text(page): ...@@ -11,27 +22,32 @@ def run_usas_on_text(page):
nlp.add_pipe('pymusas_rule_based_tagger', source=chinese_tagger_pipeline) nlp.add_pipe('pymusas_rule_based_tagger', source=chinese_tagger_pipeline)
output_doc = nlp(page) output_doc = nlp(page)
data = [] #data = []
tags = [] tags = []
print(f'Text\tPOS\tMWE start and end index\tUSAS Tags')
for token in output_doc: for token in output_doc:
start, end = token._.pymusas_mwe_indexes[0] start, end = token._.pymusas_mwe_indexes[0]
idx = (start, end) idx = (start, end)
for el in token._.pymusas_tags: for el in token._.pymusas_tags:
obj = {"word": token.text, "Usas Tags": el, "idx": idx} #obj = {"word": token.text, "USAS Tags": el, "idx": idx}
tags.append(el) tags.append(el)
data.append(obj) #data.append(obj)
usas_tags_with_count = []
seen_tags = []
for tag in tags:
if tag not in seen_tags:
try:
freq = tags.count(tag)/1000000
usas_tags_with_count.append({"0 Tag": tag, "1 Definition":d[tag],"2 Frequency": freq})
except KeyError:
pass
seen_tags.append(tag)
usas_tags_with_count = sorted(usas_tags_with_count, key=lambda x: x["2 Frequency"], reverse=True)
result = {'output': usas_tags_with_count, 'message': 'Done', 'code': 'SUCCESS'}
res = []
procTags = []
for x in tags:
if x not in procTags:
res.append({"Tag": x, "Count": tags.count(x)})
procTags.append(x)
result = {'output': res, 'message': 'Done', 'code': 'SUCCESS'}
return result return result
A1 GENERAL AND ABSTRACT TERMS
A1.1.1 General actions, making etc.
A1.1.2 Damaging and destroying
A1.2 Suitability
A1.3 Caution
A1.4 Chance, luck
A1.5 Use
A1.5.1 Using
A1.5.2 Usefulness
A1.6 Physical/mental
A1.7 Constraint
A1.8 Inclusion/Exclusion
A1.9 Avoiding
A2 Affect
A2.1 Affect:- Modify, change
A2.2 Affect:- Cause/Connected
A3 Being
A4 Classification
A4.1 Generally kinds, groups, examples
A4.2 Particular/general; detail
A5 Evaluation
A5.1 Evaluation:- Good/bad
A5.2 Evaluation:- True/false
A5.3 Evaluation:- Accuracy
A5.4 Evaluation:- Authenticity
A6 Comparing
A6.1 Comparing:- Similar/different
A6.2 Comparing:- Usual/unusual
A6.3 Comparing:- Variety
A7 Definite (+ modals)
A8 Seem
A9 Getting and giving; possession
A10 Open/closed; Hiding/Hidden; Finding; Showing
A11 Importance
A11.1 Importance: Important
A11.2 Importance: Noticeability
A12 Easy/difficult
A13 Degree
A13.1 Degree: Non-specific
A13.2 Degree: Maximizers
A13.3 Degree: Boosters
A13.4 Degree: Approximators
A13.5 Degree: Compromisers
A13.6 Degree: Diminishers
A13.7 Degree: Minimizers
A14 Exclusivizers/particularizers
A15 Safety/Danger
B1 Anatomy and physiology
B2 Health and disease
B3 Medicines and medical treatment
B4 Cleaning and personal care
B5 Clothes and personal belongings
C1 Arts and crafts
E1 EMOTIONAL ACTIONS, STATES AND PROCESSES General
E2 Liking
E3 Calm/Violent/Angry
E4 Happy/sad
E4.1 Happy/sad: Happy
E4.2 Happy/sad: Contentment
E5 Fear/bravery/shock
E6 Worry, concern, confident
F1 Food
F2 Drinks
F3 Cigarettes and drugs
F4 Farming & Horticulture
G1 Government, Politics and elections
G1.1 Government etc.
G1.2 Politics
G2 Crime, law and order
G2.1 Crime, law and order: Law and order
G2.2 General ethics
G3 Warfare, defence and the army; weapons
H1 Architecture and kinds of houses and buildings
H2 Parts of buildings
H3 Areas around or near houses
H4 Residence
H5 Furniture and household fittings
I1 Money generally
I1.1 Money: Affluence
I1.2 Money: Debts
I1.3 Money: Price
I2 Business
I2.1 Business: Generally
I2.2 Business: Selling
I3 Work and employment
I3.1 Work and employment: Generally
I3.2 Work and employment: Professionalism
I4 Industry
K1 Entertainment generally
K2 Music and related activities
K3 Recorded sound etc.
K4 Drama, the theatre and showbusiness
K5 Sports and games generally
K5.1 Sports
K5.2 Games
K6 Children's games and toys
L1 Life and living things
L2 Living creatures generally
L3 Plants
M1 Moving, coming and going
M2 Putting, taking, pulling, pushing, transporting &c.
M3 Vehicles and transport on land
M4 Shipping, swimming etc.
M5 Aircraft and flying
M6 Location and direction
M7 Places
M8 Remaining/stationary
N1 Numbers
N2 Mathematics
N3 Measurement
N3.1 Measurement: General
N3.2 Measurement: Size
N3.3 Measurement: Distance
N3.4 Measurement: Volume
N3.5 Measurement: Weight
N3.6 Measurement: Area
N3.7 Measurement: Length & height
N3.8 Measurement: Speed
N4 Linear order
N5 Quantities
N5.1 Entirety; maximum
N5.2 Exceeding; waste
N6 Frequency etc.
O1 Substances and materials generally
O1.1 Substances and materials generally: Solid
O1.2 Substances and materials generally: Liquid
O1.3 Substances and materials generally: Gas
O2 Objects generally
O3 Electricity and electrical equipment
O4 Physical attributes
O4.1 General appearance and physical properties
O4.2 Judgement of appearance (pretty etc.)
O4.3 Colour and colour patterns
O4.4 Shape
O4.5 Texture
O4.6 Temperature
P1 Education in general
Q1 LINGUISTIC ACTIONS, STATES AND PROCESSES; COMMUNICATION
Q1.1 LINGUISTIC ACTIONS, STATES AND PROCESSES; COMMUNICATION
Q1.2 Paper documents and writing
Q1.3 Telecommunications
Q2 Speech acts
Q2.1 Speech etc:- Communicative
Q2.2 Speech acts
Q3 Language, speech and grammar
Q4 The Media
Q4.1 The Media:- Books
Q4.2 The Media:- Newspapers etc.
Q4.3 The Media:- TV, Radio and Cinema
S1 SOCIAL ACTIONS, STATES AND PROCESSES
S1.1 SOCIAL ACTIONS, STATES AND PROCESSES
S1.1.1 SOCIAL ACTIONS, STATES AND PROCESSES
S1.1.2 Reciprocity
S1.1.3 Participation
S1.1.4 Deserve etc.
S1.2 Personality traits
S1.2.1 Approachability and Friendliness
S1.2.2 Avarice
S1.2.3 Egoism
S1.2.4 Politeness
S1.2.5 Toughness; strong/weak
S1.2.6 Sensible
S2 People
S2.1 People:- Female
S2.2 People:- Male
S3 Relationship
S3.1 Relationship: General
S3.2 Relationship: Intimate/sexual
S4 Kin
S5 Groups and affiliation
S6 Obligation and necessity
S7 Power relationship
S7.1 Power, organizing
S7.2 Respect
S7.3 Competition
S7.4 Permission
S8 Helping/hindering
S9 Religion and the supernatural
T1 Time
T1.1 Time: General
T1.1.1 Time: General: Past
T1.1.2 Time: General: Present; simultaneous
T1.1.3 Time: General: Future
T1.2 Time: Momentary
T1.3 Time: Period
T2 Time: Beginning and ending
T3 Time: Old, new and young; age
T4 Time: Early/late
W1 The universe
W2 Light
W3 Geographical terms
W4 Weather
W5 Green issues
X1 PSYCHOLOGICAL ACTIONS, STATES AND PROCESSES
X2 Mental actions and processes
X2.1 Thought, belief
X2.2 Knowledge
X2.3 Learn
X2.4 Investigate, examine, test, search
X2.5 Understand
X2.6 Expect
X3 Sensory
X3.1 Sensory:- Taste
X3.2 Sensory:- Sound
X3.3 Sensory:- Touch
X3.4 Sensory:- Sight
X3.5 Sensory:- Smell
X4 Mental object
X4.1 Mental object:- Conceptual object
X4.2 Mental object:- Means, method
X5 Attention
X5.1 Attention
X5.2 Interest/boredom/excited/energetic
X6 Deciding
X7 Wanting; planning; choosing
X8 Trying
X9 Ability
X9.1 Ability:- Ability, intelligence
X9.2 Ability:- Success and failure
Y1 Science and technology in general
Y2 Information technology and computing
Z0 Unmatched proper noun
Z1 Personal names
Z2 Geographical names
Z3 Other proper names
Z4 Discourse Bin
Z5 Grammatical bin
Z6 Negative
Z7 If
Z8 Pronouns etc.
Z9 Trash can
Z99 Unmatched
\ No newline at end of file
...@@ -38,3 +38,13 @@ def usas(): ...@@ -38,3 +38,13 @@ def usas():
result = get_usas_for_data(page) result = get_usas_for_data(page)
return result return result
@app.route("/sentiment", methods=['POST'])
def sentiment():
    """Handle ``POST /sentiment``: run sentiment analysis on the posted text.

    The request body must be a JSON object with a string ``page`` field
    holding the text to analyse.

    Returns:
        The response built by ``get_sentiment_for_data`` for a valid
        request. For a body that is not a JSON object, or that lacks a
        string ``page`` field, a JSON error payload with status 400.
    """
    # silent=True yields None for a missing or malformed JSON body, so every
    # kind of bad request funnels into the single check below.
    request_data = request.get_json(silent=True)
    page = request_data.get('page') if isinstance(request_data, dict) else None
    if not isinstance(page, str):
        error = {
            'output': [],
            'message': "Request body must be a JSON object with a string 'page' field",
            'code': 'ERROR',
        }
        # Flask serialises a returned dict to JSON; the tuple sets the status.
        return error, 400
    return get_sentiment_for_data(page)
...@@ -5,3 +5,4 @@ ckip-transformers~=0.3.4 ...@@ -5,3 +5,4 @@ ckip-transformers~=0.3.4
flask-cors~=4.0.1 flask-cors~=4.0.1
spacy~=3.7.4 spacy~=3.7.4
googletrans ~=3.1.0a0 googletrans ~=3.1.0a0
pandas ~=2.2.3
\ No newline at end of file
from googletrans import Translator
def translate(word):
    """Translate ``word`` from Simplified Chinese (``'zh-cn'``) to English.

    Thin wrapper around ``googletrans.Translator``; a new ``Translator`` is
    created for every call.

    Args:
        word: The Chinese text to translate.

    Returns:
        The object returned by ``Translator.translate``, unchanged. It is
        the translation result object, not a plain string.
    """
    return Translator().translate(word, src='zh-cn', dest='en')
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment