diff --git a/api/api_functions.py b/api/api_functions.py
index 8f2984856e0e2f6e10aa04656112e1025ac95a9c..2b83b6da513fb4a19dec90c8347fb2b0043e1678 100644
--- a/api/api_functions.py
+++ b/api/api_functions.py
@@ -4,6 +4,8 @@ from func.ner.ner import *
 from func.sentiment.sentiment import *
 from func.translation.translation import run_translation_on_text
 from func.usas.usas import *
+from func.collocation.collocation import *
+from func.concordance.concordance import *
 
 
 # Perform NER on a file
@@ -51,4 +53,21 @@ def get_sentiment_for_data(page):
     if result["code"] == "SUCCESS":
         return make_response(jsonify(result), 201)
 
-    return make_response(jsonify(result), 400)
\ No newline at end of file
+    return make_response(jsonify(result), 400)
+
+def get_collocation_for_data(page):
+    result = run_collocation_on_text(page)
+
+    if result["code"] == "SUCCESS":
+        return make_response(jsonify(result), 201)
+
+    return make_response(jsonify(result), 400)
+
+def get_concordance_for_data(page):
+    result = run_concordance_on_text(page)
+
+    if result["code"] == "SUCCESS":
+        return make_response(jsonify(result), 201)
+
+    return make_response(jsonify(result), 400)
+
diff --git a/func/collocation/collocation.py b/func/collocation/collocation.py
new file mode 100644
index 0000000000000000000000000000000000000000..85db37eb60b4dbec08d062d9db9d5f883b0c69a6
--- /dev/null
+++ b/func/collocation/collocation.py
@@ -0,0 +1,64 @@
+import spacy
+import math
+from collections import Counter, defaultdict
+from shared.translate import translate
+
+# Commented-out sample input used during development:
+#page = '“专精特新”企业，是指具有专业化、精细化、特色化、新颖化四大特征的中小企业。创新是这类企业的灵魂，足够的研发费用投入则是开展创新的重要保障。许多尚处在成长期的“专精特新”企业，近期普遍遭遇“钱紧”难题。如何集聚更多的资金投入研发、保持创新领先地位是这些企业近来面临的最大烦恼。“作为一家新材料研发公司，创新是我们发展的重要驱动力，只有研发投入的不断加码，企业创新发展的步伐才不会降速。”浙江省“专精特新”企业、宁波创润新材料有限公司董事长吴景晖说，过去3年，企业在研发投入方面不遗余力，累计投入2500万元，这对企业来说不是个小数目。“今年新兴市场的研发需求十分迫切，我们一直想加快‘超高纯钛及钛合金中试生产线项目’的研发进度，但苦于资金不足。令人高兴的是，今年4月340万元存量增值税留抵税额的到账，有效缓解了企业的资金压力，加快了企业的研发进度。”吴景晖说，目前，“超高纯钛及钛合金中试生产线项目”正在有序推进，一旦投产将缓解半导体产业的高纯钛原材料供应不足问题，提升国产溅射靶材的市场竞争力'
+
+# Step 4: calculate PMI for a bigram from the bigram and unigram probabilities.
+# The 1e-10 floor guards against taking the log of zero for unseen events.
+def calculate_pmi(bigram, p_bigram, p_word):
+    word1, word2 = bigram
+    return math.log2(max(p_bigram[bigram], 1e-10) / (max(p_word[word1], 1e-10) * max(p_word[word2], 1e-10)))
+
+
+# Replace characters that would break downstream rendering, then trim whitespace.
+def escape(token: str):
+    token = token.replace("&", " ")
+    token = token.replace("-", " ")
+    token = token.replace("<", " ")
+    token = token.replace(">", " ")
+    token = token.replace("\"", " ")
+    token = token.replace("'", " ")
+    return token.strip()
+
+
+def run_collocation_on_text(page):
+    corpus = []
+
+    # Step 1: tokenise with the spaCy Chinese pipeline, dropping stop words
+    nlp = spacy.load('zh_core_web_sm')
+    doc = nlp(page)
+    for token in doc:
+        if not token.is_stop:
+            corpus.append(escape(token.text.lower()))
+
+    # Step 2: calculate unigram and bigram frequencies
+    word_freq = Counter(corpus)
+    bigram_freq = Counter(zip(corpus[:-1], corpus[1:]))
+
+    # Step 3: calculate probabilities
+    total_words = len(corpus)
+    p_word = defaultdict(float)
+    p_bigram = defaultdict(float)
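+
+    # Maximum-likelihood estimates: raw counts divided by the number of
+    # unigram (or bigram) observations in the corpus.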
+    for word, freq in word_freq.items():
+        p_word[word] = freq / total_words
+
+    total_bigrams = len(corpus) - 1
+    all_pmi_scores = []
+    for bigram, freq in bigram_freq.items():
+        p_bigram[bigram] = freq / total_bigrams
+        bigramstr = bigram[0] + ' ' + bigram[1]
+        translation = translate(bigramstr).text.lower()
+        pmi = calculate_pmi(bigram, p_bigram, p_word)
+
+        all_pmi_scores.append({"0 Term": bigramstr, "1 Translation": translation, "2 PMI Score": round(pmi, 3)})
+
+    # Keep the 40 highest-scoring bigrams
+    top_pmi_scores = sorted(all_pmi_scores, key=lambda x: x["2 PMI Score"], reverse=True)[:40]
+
+    return {'output': top_pmi_scores, 'message': 'Done', 'code': 'SUCCESS'}
diff --git a/func/concordance/concordance.py b/func/concordance/concordance.py
new file mode 100644
index 0000000000000000000000000000000000000000..966a1a6d589183ca7d94aa6071afc9c63bd5d351
--- /dev/null
+++ b/func/concordance/concordance.py
@@ -0,0 +1,113 @@
+import spacy
+import math
+import re
+from collections import Counter, defaultdict
+from shared.translate import translate
+from wasabi import Printer
+from spacy.matcher import PhraseMatcher
+
+# Commented-out sample input used during development:
+#page = '“专精特新”企业，是指具有专业化、精细化、特色化、新颖化四大特征的中小企业。创新是这类企业的灵魂，足够的研发费用投入则是开展创新的重要保障。许多尚处在成长期的“专精特新”企业，近期普遍遭遇“钱紧”难题。如何集聚更多的资金投入研发、保持创新领先地位是这些企业近来面临的最大烦恼。“作为一家新材料研发公司，创新是我们发展的重要驱动力，只有研发投入的不断加码，企业创新发展的步伐才不会降速。”浙江省“专精特新”企业、宁波创润新材料有限公司董事长吴景晖说，过去3年，企业在研发投入方面不遗余力，累计投入2500万元，这对企业来说不是个小数目。“今年新兴市场的研发需求十分迫切，我们一直想加快‘超高纯钛及钛合金中试生产线项目’的研发进度，但苦于资金不足。令人高兴的是，今年4月340万元存量增值税留抵税额的到账，有效缓解了企业的资金压力，加快了企业的研发进度。”吴景晖说，目前，“超高纯钛及钛合金中试生产线项目”正在有序推进，一旦投产将缓解半导体产业的高纯钛原材料供应不足问题，提升国产溅射靶材的市场竞争力'
+
+# Calculate PMI for a bigram from the bigram and unigram probabilities.
+def calculate_pmi(bigram, p_bigram, p_word):
+    word1, word2 = bigram
+    return math.log2(max(p_bigram[bigram], 1e-10) / (max(p_word[word1], 1e-10) * max(p_word[word2], 1e-10)))
+
+
+# Replace characters that would break downstream rendering, then trim whitespace.
+def escape(token: str):
+    token = token.replace("&", " ")
+    token = token.replace("-", " ")
+    token = token.replace("<", " ")
+    token = token.replace(">", " ")
+    token = token.replace("\"", " ")
+    token = token.replace("'", " ")
+    return token.strip()
+
+
+# Return the 40 bigrams with the highest PMI scores (the same method as
+# func/collocation/collocation.py, minus the translation step).
+def collocations(doc):
+    corpus = []
+
+    for token in doc:
+        if not token.is_stop:
+            corpus.append(escape(token.text.lower()))
+
+    # Calculate unigram and bigram frequencies
+    word_freq = Counter(corpus)
+    bigram_freq = Counter(zip(corpus[:-1], corpus[1:]))
+
+    # Calculate probabilities
+    total_words = len(corpus)
+    p_word = defaultdict(float)
+    p_bigram = defaultdict(float)
+
+    for word, freq in word_freq.items():
+        p_word[word] = freq / total_words
+
+    total_bigrams = len(corpus) - 1
+    all_pmi_scores = []
+    for bigram, freq in bigram_freq.items():
+        p_bigram[bigram] = freq / total_bigrams
+        bigramstr = bigram[0] + ' ' + bigram[1]
+        pmi = calculate_pmi(bigram, p_bigram, p_word)
+
+        all_pmi_scores.append({"0 Term": bigramstr, "2 PMI Score": round(pmi, 3)})
+
+    top_pmi_scores = sorted(all_pmi_scores, key=lambda x: x["2 PMI Score"], reverse=True)[:40]
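+    # Keep only the bigram strings; they seed the PhraseMatcher in run_concordance_on_text.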
+    terms = [item.get('0 Term') for item in top_pmi_scores]
+
+    return terms
+
+
+# Strip ANSI escape sequences so coloured terminal output is JSON-safe.
+def clean(text):
+    ansi_escape = re.compile(r'''
+        \x1B  # ESC
+        (?:   # 7-bit C1 Fe (except CSI)
+            [@-Z\\-_]
+        |     # or [ for CSI, followed by a control sequence
+            \[
+            [0-?]*  # Parameter bytes
+            [ -/]*  # Intermediate bytes
+            [@-~]   # Final byte
+        )
+    ''', re.VERBOSE)
+
+    return ansi_escape.sub('', text)
+
+
+def run_concordance_on_text(page):
+    nlp = spacy.load('zh_core_web_sm')
+    doc = nlp(page)
+    terms = collocations(doc)
+    concordances = []
+
+    # Match every occurrence of the top collocations, case-insensitively
+    matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
+    patterns = [nlp.make_doc(term) for term in terms]
+    matcher.add("TermCollocations", patterns)
+
+    matches = matcher(doc)
+    printer = Printer()
+    for match_id, start, end in matches:
+        contextStart = max(start - 7, 0)
+        precedingSlice = clean(doc[contextStart:start].text)
+        precedingSliceTr = clean(translate(doc[contextStart:start].text).text)
+
+        # Highlight the matched term in red, then strip the ANSI codes for the JSON payload
+        matchedTerm = clean(printer.text(doc[start:end].text, color='red', no_print=True))
+        matchedTermTr = clean(printer.text(translate(doc[start:end].text).text, color='red', no_print=True))
+
+        followingSlice = clean(doc[end:end + 7].text)
+        followingSliceTr = clean(translate(doc[end:end + 7].text).text)
+
+        context = precedingSlice + ', ' + matchedTerm + ', ' + followingSlice
+        contextTr = precedingSliceTr + ', ' + matchedTermTr + ', ' + followingSliceTr
+
+        concordances.append({"0 Term": matchedTerm, "1 Eng": matchedTermTr, "2 Context": context, "3 Context Eng": contextTr})
+
+    return {'output': concordances, 'message': 'Done', 'code': 'SUCCESS'}
diff --git a/func/sentiment/sentiment.py b/func/sentiment/sentiment.py
index a7c5076365c9208c028653fde23458a0e610a72c..34d4fda3cde5f283ce0770779854d41f2d18fbe5 100644
--- a/func/sentiment/sentiment.py
+++ b/func/sentiment/sentiment.py
@@ -25,9 +25,13 @@ def run_sentiment_on_text(page):
         res = nlp(p)[0]['label']
         counts[res] = counts.get(res, 0) + 1
 
+    # Always report a negative count, even when it is zero
+    if 'negative' not in counts:
+        counts['negative'] = 0
+
     sentiments = []
     for k in counts.keys():
-        sentiments.append({"Sentiment": k, "Count": counts[k]})
+        sentiments.append({"0 Sentiment": k, "1 Count": counts[k]})
 
     result = {'output': sentiments, 'message': 'Done', 'code': 'SUCCESS'}
     return result
diff --git a/main.py b/main.py
index 4a14e71ad676ae89f5f0b84675b9ae2d26e3f5cf..5eec639603704766370978254d88bf5b2942a3a3 100644
--- a/main.py
+++ b/main.py
@@ -53,3 +53,19 @@ def sentiment():
     result = get_sentiment_for_data(page)
 
     return result
+
+@app.route("/collocation", methods=['POST'])
+def collocation():
+    request_data = request.get_json()
+    page = request_data['page']
+    result = get_collocation_for_data(page)
+
+    return result
+
+@app.route("/concordance", methods=['POST'])
+def concordance():
+    request_data = request.get_json()
+    page = request_data['page']
+    result = get_concordance_for_data(page)
+
+    return result
diff --git a/shared/translate.py b/shared/translate.py
index 5c993a10052f608854fb410b31df248266e2d65c..d269dd1acd35cfa46b009d4527360251b63e6b88 100644
--- a/shared/translate.py
+++ b/shared/translate.py
@@ -1,4 +1,8 @@
 from googletrans import Translator
+import csv
+from os import listdir
+from os.path import isfile, join
+import time
 
 
 # wrapper for the googletrans library. Takes in chinese string returns english
@@ -8,3 +12,39 @@ def translate(word):
     result = translator.translate(word, src='zh-cn', dest='en')
 
     return result
+
+
+# Read a CSV file into a list of rows.
+def get_csv(fileName):
+    with open(fileName, newline='') as csvfile:
+        data = list(csv.reader(csvfile))
+
+    return data
+
+
+def list_files(directory):
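+    # Batch helper: translate the first column of every CSV file in a
+    # directory and write the results to a sibling "trans<name>" file.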
+    for f in listdir(directory):
+        if isfile(join(directory, f)):
+            print(f)
+            file = get_csv(join(directory, f))
+
+            transfile = []
+
+            for line in file:
+                try:
+                    translation = translate(line[0])
+                    transfile.append(translation.text)
+                    time.sleep(1)  # throttle requests to the translation service
+                except Exception:
+                    # skip rows that fail to translate
+                    pass
+
+            with open(join(directory, "trans" + f), 'w', newline='') as myfile:
+                wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
+                wr.writerow(transfile)  # one row containing every translation
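+
+
+# Example usage (the directory name is illustrative):
+# list_files('data')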