# Provenance: func/usas/usas.py as modified by commit
# 3b994a961e4a24ca2800de93a14006376e311a13 ("Work towards USAS"),
# Tom Edwards <edwardstj1@cardiff.ac.uk>, Wed, 12 Jun 2024 09:34:09 +0100.
import functools

import spacy


@functools.lru_cache(maxsize=1)
def _load_usas_pipeline():
    """Build the spaCy pipeline with the PyMUSAS rule-based tagger attached.

    Loading the two models is by far the most expensive step, and the
    resulting pipeline does not depend on the text being tagged, so it is
    built once on first use and reused by every later call.
    """
    # We exclude the following components as we do not need them.
    nlp = spacy.load('zh_core_web_sm', exclude=['parser', 'ner'])
    # Load the Chinese PyMUSAS rule-based tagger in a separate spaCy pipeline
    chinese_tagger_pipeline = spacy.load('cmn_dual_upos2usas_contextual')
    # Adds the Chinese PyMUSAS rule-based tagger to the main spaCy pipeline
    nlp.add_pipe('pymusas_rule_based_tagger', source=chinese_tagger_pipeline)
    return nlp


# Perform USAS on Text
def run_usas_on_text(page):
    """Tag *page* with USAS semantic tags and print one row per token.

    The text is run through the cached pipeline from
    ``_load_usas_pipeline``; a header line followed by a tab-separated
    ``text / POS / USAS tags`` row for each token is written to stdout.

    Args:
        page: The text to tag. Presumably Chinese, since both models
            loaded are Chinese ones -- confirm against the caller.

    Returns:
        None. The result is only printed.
    """
    output_doc = _load_usas_pipeline()(page)

    print('Text\tPOS\tUSAS Tags')
    for token in output_doc:
        print(f'{token.text}\t{token.pos_}\t{token._.pymusas_tags}')