From 24abc819d3a7659399167ddb0e496457bd252807 Mon Sep 17 00:00:00 2001 From: Tom Edwards <edwardstj1@cardiff.ac.uk> Date: Wed, 26 Mar 2025 09:54:52 +0000 Subject: [PATCH] demo branch --- db/datasets.db | Bin 1445888 -> 1445888 bytes func/collocation/collocation.py | 27 ++++++++++++++++++++++----- func/concordance/concordance.py | 17 +++++++++++++---- func/translation/translation.py | 1 + 4 files changed, 36 insertions(+), 9 deletions(-) diff --git a/db/datasets.db b/db/datasets.db index 8489e3980dc54d847378b5dff7902c70befe1462..cde875a2ec60e84f813b1fc79f999e817796427e 100644 GIT binary patch delta 5581 zcma)A?Q<1Z8Q;Bk_uk~*-0aO;OLA{W=m;%E>)H1`Kp+(Qf}}K1gkYhh5-KeUmr?;+ zD51Lpv|}{{%^k;%hUygCY8yy%*AKq5opHoZ#=oF2ezBudnSK)@_&b}JoPBY|4}12^ zy}$FE=RD{4yzTJ#!0`CMNU89tAc#-nc>>Rh^Y_`}r<VtR^5w=<mn;avawUgR%w1?8 zUwA<Io^nokMLD4yRVvDJ%8>GuGN5c#`jm&2US*|XC=V$2DfcL4<u0YD<R(?}e&eaX zd5a-H^Ex3V>y<Gd^GcXM=`F%o^4c)=dd(Qud5!q@122!c<zZXV^y=};@4YPMf+u1A znkQo3?Y@k0vHK!^`P@B*@sRrh#v|@w{C=-{5c86I0OQG1?tZ)q?lbsfjk_1~cilmZ zC){25<q>x$=5M$Im~V7<V7}emhVf%}3&wVLGse^ICX74WM=^G~4`Hl#*JCWZ4`RIG z_F}y5u7!l9?rO|scU1welrSJjzYxTM3o7aBJlE&0$_nb;JtOqsp`m?WQf)&sO;x3a zZ5h>z_hd`{W7F~?X>qO3T?t@&&p+%qkXA2ll1u*ep{CA!O8EJk{oQZ%*B3`CvM-Z{ z)#;MEKG@^)Jh#-;sX;BhdU3Q+@{j$j>0E<I-y9)hy`q#$r;9><B0HEa;;vHZGPHaq zs->kl(ABZ3e`%^)H2MQ+R!?@IeQ~I*>lWbLff#To0jS~pj>Z8;N90L)`qtW7B_$Ux z$=%{$eO$ZEQAwsnOhdC0w0kT8YG|sa!ie+cW+(280ZmG6+oItUk5yVTer80xtwhbn zj(8v{#DXPEOCwHXx91XohD{wcyc?7GYywoLwxQdc%&}sfoXw01`Hh)TT;iEb=c~H= zAXc{bjLIPzfNWS<5kZ5~f6t$i7;s%2Xc!jJ3_1^VrMi^3OpwOKWsu(?cAh!vJ``y7 zdV!a3JGup%onBSf;s3TlT%Yb7+vRQufWO?y15MqanpNF0B6s+^52u&snO_Tj=1S(@ zg-L_l5l5%xv+_0Zl6);i=7{Ze27)+ZQ<V_SQ3<%VK<w>jy3=R6rCPTvkz6|Uc0lgb z+tB;RDR$g#vE$kr{Da*RXBU|MiC{b4S8YuL%`C#1p54*bI8b$T(}2G$uxxY$t`^p& z3~e5bX=7SCR@3Ssu~447B<%4>Y!A~?9gBwChbi-L9B5KUcO0JbV;|N_XC;ITsgjvF zG&&uN0wqX#Au7qzZ4LIWUf!DOT!;Wi5@5t|Y&B}8=i@+&T9!t6_h6d&(v6HrBA(CA zb~r(TP1)sfhGxJ@9BAm2#C1B908}kvYkc^sk*qYF<nX(qt4-T54Qi`VeV$AJYNkag zZzC3mzLo$aND+4R{PJRX*64w^3;pm$q5pzPHn0QlkFEoWsBRL%hY9A4-w1%5GcEuZ zoUt~YWxf**y=<FJ(QQl<F0ZL1df=}l08NuxMo1q>+?W7VDIuYlSu55hUVtY?S39bz z8>VAN^znoQAk-!%?-lHXr{ciC{)7k4l61Uo=Kax1%yl5eE5)Q}DlWu{62t3A0BS0n zl@A?EVtWEm)h*;Qj{38D4c5<e)haD=(U(W+-&@f!5M658h|-#E^W>|+?<&63-y~vL zZ|=_Hm!#fQu|X2AB7WxfXBLT95kY?~G8+xX1=^}*P=W$uKC3d|o&+GVspatA)%nT> z_bC9|d%SxWQiN^kR7Fno!TS}#uDj#H<}lo@wTYZG>`UE^e@d6Ud?59ohv%g_s!53h zRsHeN!rcwxJGI`tG@d|oFB{#=Q4Qoa&ERdu?j;ikBIgswi2~&Wpk`^B9*Ta0GU|&P z`I}vhtb9!Hz%5ZRrmdQ)PK>DS`(r@l3&cQV-S8zn9tYx%RMci15X?e8?r!kq?!ui< za%t%zijDdK=^{vRA*dqOLQJIG&~1WJOXt#R*H8DlYXac!_RZglX=)TlFbp{3`6;>V zZ=1+m5trcni;*CE;&SKUoZMQmQ65CT7bF6OmPuXy25)h&@UsO@p;}N`v@?;y4mXP( z{<B8?SYG^5ZMadc_;OL~PnCLuMN>yO<qQ>7s-<gHb#<!4f2~=3sW2rgcUT2DBs56b z57I-}b9<T*{uq{r%6p_t0~LcAVS{@#0jN_G`IR$z=h@@#w*!T~ugohnmxNW-U$U$5 zuW4{ynfdeoV2dY3w{buf%^^`w_0s<@Fck-CR5f*z=ik^wE;E%W-rh4Q4ZoK!dFALL zu5g<V7^wYyxl+8<b!GTUzQgNESQA`dxJI50dHn_2<o8@I{_(RUXKss%ax|jakpRe? zxito4F`F0~AN(1xB>`wsLkj_!75vOZ)6K52i8hudhFB@e)8)mO7(#XG7_3v9w+myz zcK~ef;aK1h$3X3?Q-8@!>XU$b$K%3S#Ig-O*v{&7t?9Nh7us3hHy7H=2*fW38*zac z$JZ$OIUhye7lSx_GPVyOre&CXykO37A_gSrsZ<@V#GMYnmlA*uLah@DjYv>aQZrrG zu{zN4X3M-wpog|i<O)*lA3<mQ{CV;9MzQX#{^sJ?wR}>3b`M5(M5pFhCQ2qwHC6+E zD*=eqWL2k|>e@q%@Hdl?AS^2Rh11TmwbCq3XN8Xhxk>owf=Z?sqw1r}k*h2g5<>;k z4-a^C0q~t;^QT8<LR8mmT2)uJmi&!>X!%WUQXc+QzTL|u2z4w?udZ9tT=x5><#ve` zlfFFIa;vL$BOR0sEqB(=H7c2Z{kLs`CH6*TS(;^_P!1JptiSp?fT2VHpl(z4^&s3| zJ)IzN4#-(7aH4B1H<cpGh_m%bc95?}yymDN2NjfK+M(M-nc+1Bs)g7MbD%jQ^JRut zOb})f^bY(zqEz>kI1r@`I<jzp3i`fSCG9mPthP<03Vs_#j6wns-IW&V=NPad4zz51 zDF}6;wMuJ`==U_WUft}=ABv9Hu@lSMdw6e}uMRnHYQ{pTILnKTAzti=Zj0&?eIky+ z@t%c)?Fm3)P)hmu#0d1&KqLOc1x`y)G!l+LwUO5Bh<raNLHvg9Hbvfj5_3>QT!Z(! zcuufSdwAokqVNX$w1@Y0JRjp}$8#DF``CwXk^;UkiWKiMo*Q`B*UIboZ7JUD18ALp zd}-Tv@1K;e%_z0OHmNr$%`A!@5NQS-x)$v*yt@;Cw#B;iu*tl;5`Y#yR)j+BtOrlD H-P-dX0ZXYu delta 20404 zcma)^d2FNUdDx{HX%5cL?Ck7rot??<Og7%E9Xq?EZY5pYX^glIoOqK0j%^_IrbvPy zEz)rcBq@^SBgLUaU8HX6zDC!KMih0DB1LhL0u9pq<NjB)2;jv04*t>ZCP0y*HsJoA z_xVIoGvhYEh(q#wkLP{v>t%TH>*2+(58b)`$8~k}zscXf;O|SrpK7oF<DC!Y-@Q?v zX{@WOyY=E2uk~LZeIfYCz26KzxiNY_nE$={FGb(w>pzXY!|P`BXBhFh=x2HVwdiMf z{q^Wi^Ew;-G_SX!pW^ju^le^$Jo;0-Mxt-=`Zv)}^7=;fP3Cq+e}eZPNAL0aTJ$d8 zycPX%-Zw_?9NxS7a^s!ooy&EhPrUc|kADB({?>2*OsKsj6zuG1>1gU`X-b8D_LVz( zUtTQ)_8Q**!QGK?W8;-~>aM&4u%Ep0M)FspSD5<Rd-FecHnpue7z(wu1)EY!%QtRk zf9HQ+p1+)J$XwmNcKdC!&roM|a8p~bGuYAA+L?{K{9<#%$afm|8i&6B?nLm#hHS&o zEA=<`8o%@X&rgIeHf9??zVZHrJD)P!9sIjzR{+$1df|4q@9%H?$@TTdd-We&y%ekq z+`94;mx2KM=B3w{{zLSx+5bP@sm|t{9ijG6XItvq_1AA_fAMp#<m&I%tuXYBf85aP z{l6D|-M}<)2GiN9!T<Kkoi8>8ziIF6c&nv*-wTKNS6Q1V=N`Lb3vTZtd*zY);Lyp8 z`Ooa^YB@LU&g_>@W-3cZ?o8V08}~{>sh92qK4IxH?rh|_OFTQ*p51U}$L#F9o$Ipm zV|JpXt9^7lP|mO0J$ZNffjhSAMzhYrsGVQ)PEPE2rnBQMzPE?s&f@VoD6Py`WhrmR zAK6ETc5F4^_C$f)O)t5{VV2}u0Cy6r!0V1}l=BJQvodw)j%L}AlT~KA-IEm=oSt_R znR33{neBCE^LBp0K8}JA+4<zLo7iR>7WrPdD-Z~H+pB^MYbNcE?bw3}XCk9}vz$G% z%%SW;UKY)dnT@-_o83ET&lK(MSY^4}j>TEGoZE78kL<*NlUcWu!<E>9(-(GTx0zT8 zPkW^vdu6)PGYA^J-n^IHcJrlLK;2#Iv9q(}^*pxwHtc~3d*y*U)E%%#2A%>~>1H48 z+sSPm<o3?EyN~(D{H9<z*JBTD$kIpqV8z)z28*F!Fjy{5@yy-qcP5V9oxLht-h&-? zdCy5&LvCK6)p_SglNoDilZAD1Kxdz=f4Cb8z0>}jjdKqxJxMpe)~3%C-ES)0%Ej4o zF7J-dy1nc6%DSDf3_f@_-!EUy=Dmmz6o_QdT9E^mPfF!nSlD24+n&idy&1bWYR_hb zpvzH+0D`@>h&{9D?k}>N;)?IRp(4ZJ9B*lW1)bEgot%6dws;#D-nP?oZ+~=b4S4Gh z19tZwY*V{zW%{JDJjpDE+I@?5Y12t;c%_*tR6@3c2Tp!S$Y%Ho`a1)wtfTNaW82Qi zFb9Tx1i{I;EM*|~r3zM`u%-aiNe($n5ADGo7YP~}C>ImZhIO5>qd+CR$YVrT`P%NA zk%b2$Ea@F&5iJ4yByO)3S;$?<djnBBw+1`u&fw$Cs)*a&V1JrdXKvU|#N;d%NZ$$b zuQFcpGRyYhsJFeZBy$GS&ykCXj2A7+v6I90&N%#Um!jodp<F1~@kw_zUoP~!g+W2w z$(++W>5JJ+T2Szelz9<r45fgO*y+P^Zd9asYS~#_0t@c?u)8=WAA7?x`4}k21qr*y z0q1bnUO4i)cI}h2mnwWO9C2rc+~pm}4bHsYZFj5Jj?W;|DvrVo5!uNZE7-9;)^#5p z3XFx{T2-x~obT;#=;>6mlyeFF-WeQ+B5b3Y#YydeUB#v>9bS_|7=#22?mjw>J6mgZ zCh6qnK05Br{xDhJc=O7I3x8G@c>ltmVLSfdf*88XM$4ZwbWQD{V0&BBnW3A${CfXq zqnFIc@fvH|=8w$&P3x7>#)%KFU&>zk_{LW+-1#Xp^t?H1Ywl<bVm;5yS+?;Dw>s)? zUpIsQ&&y}ltF5^mW82)4dVi?#PWBIm>j!UK`}6CV_WFyzQg`uJfc=XX?@qlFeUq7Y z-rM>2)tODLo$VdLP*boeB>UX4+P>*Nd14Pn%13kdPM>hfOwozO?TH7@+N@wJ7rj@` z_3=T!EgwgqzcV=G9V}LQhAYc+cK4uCv9dXd$`1JB^8GSya@$=wDIZTe6Ftsk0?~A~ zRw0Hje2YDnT}qXY7>a%DvNPS@{<6J2AbNdhSQip4zOZ1}U{$weK6=tVPNMr!d%A&$ z`Ro&jZzsEiBXrl@0-Bn+PJ9Tt4(RITd>&IIB77XN0I~{CCEV-uKLQXZwTcyCUM+55 z7tq!Or#yuvXBunOCn!Dlz@SL^%^RI_r_<%5weN-d6{jk^0ms@eh!!ZU3bqHwPaj6~ zq;|a9nIGa%$gY#xaE`~k^e*7q#bu5UeD?N{>?<ZGzvh$8?5oQ;;~De=SVy-alt`49 z-jIXmSDb?gFnrH?7Vv<-w~>@{kk@Wjd(^U_wZNrwPWQ888_v`=kUGhvSIP*54MRER z(3PGZJ84Z|Y~e3493xxRqm^?fY7TT8s2PBXL?ahxRY&lVf9S)?RNUU0bQX`$CSK3J zcQOxNDr-|tZrkhMai@FKRRDuFZou4zolL<E!YJv8y>dN?g(>GUIw<vD9=`ahWyhxN z;u_zxVoO`OIA~Ay+Ub)j8UqS4#3|=DPYF4gcDj21u^dw?Q0UZ9b1S>RUyu`HC(E2s z6{Y2+my{-LP49Gu+B(bmk{yeFbUahLjtXX|H53X2SW}pzW@b}M@SRX+OS?`5Xk^4b z3Bx$<OwQRJV>setG=OzZ1w6Y2Ooi(J7b#3mxFAYkt_LMxC%WzAE(#r0Ak@YStzqkc zkar>PE`?dNX2Wy)c_nKS*|%d^w0ODj&|BJY!s{}#hQ~RdsClEN^xQ(vg**xgrl>yh zeGK^Glp|pfXB;gdzGa}wIqZQ=7KmiUuRKtGMt3Rqu`sxC3rXi7VaE@VG}$3DjMXWZ z<|@;(`lVRI8m*&Wrgm<<^s1$}5Wd1^#5YYJa-hobsNJ_y&h@i#0E9VX;#=1$Cv+1- zF@4TN5kJgZf8cERoFMa;hn**4gYh<isS+MSw~80Evd(@!1BqstvsT&SsRiupuAIo| zMRDBL2c2RXr65>HY=}<EZ3WJ3!9<UEIy3vM?@Xobq@ka|KfA*Y<=mh%v1LC@v&|_Y z9{{<RJ#bRisyl;e$)MTKp7kw=5g&-Sg)OuKKA9ltsKk7wtFCL$?7Icuff20F4%&T7 zj0xZj$)Vt|-a~JHQ%Ey|hKa~WUdM;+trb4neUF^<ul1LYC!IdS7(D2E<VH^Hw0H{c z#HJICRVH_x1W1eE%{b#z*eZJv!H%f!3XPq<74cP8rU~EF9;JTx&aEG_jJX?3qgwG# z0jE|e!uiB37zR7`gyTD@L#K~3in-#v#w(n`dNGT@qA3_`5eCP7TJLwZlOmVLc&vx2 zFa>KykRYeMU2u9&_~<NJS&<8ae>o~=?0q7!RRLewZ0hcwxU2Kx(WX<_4}p*cj37aq z%2Z56m=HvwfLN5Lh$f5rRz5EADjdlQYD&=AXi8@h;d~-)c~wR*lyH%Kyy&HI{KugX zO2JnU#t5+}FbA-4zVwv~34hW~9}cowT06Fh4tfqI_qS1kx;EHu4z_imF)g1{j5g@# zY;HqZYL$Xurv?zgA~IGSKT}-Bf$>ViO-*g>fk0I`Dzf#+7!KkVXKodDS|Z!n7C6EG zsLU<@?ngK_V<kI3TZwK|QkWKTm*k{sgU!LAjQ?@sNMi&UaQet+j@i63|9}8mjMMqK z(8JD+{%YTETxj~G)Y44-8<?jjb}|PQ;U{^Yi=M6ac%)41!k5Qko>n5GPXQz3Pxd{B zDZ!Mo>C!`IDXt5GFAa|oZhL6J$t}u))+zKDD-4IO=xyUi7Xw}<>MX1}i#fPUuss`g z3vrwk)QqCkr@gP77=i$+41y-)fRu>Jriz=miQ#DFF@hz6V#Lex-BVT4g7O{QCZz08 z)~Zp!j1~r%&A=(%@+P>~I81T~d~Zw@YS*aqF1RtEAga#=z1>-}=g|^zm@}|#$B4L4 zEY=hejczVNH&AdVQm6}z0Us53(LR0nkxwfSRUIOZHC*ZQ*FglUhe>B+@KDZ=;O3BB zkuBq;fSe*P8miLsbHm>JAn#BPM4jTkV&8D@j2~OOusz7rg1vpLC=k?lf4BbdyB8XR zjW=?2H*!P?Pj0+XSJ(B$u3s||#)G%3aYC>)*xugU-kII`wF{rEU;FOGo87^S-NA;# zU}M8QKdJW_CVk?)b4k5WOLM5Xtx1!56CYnY4R?OptbZ=t357aCp{C}uaOV%Nci(SZ zyMO8a@cRu98$P(1ZTR?7eei1CNZrN1x)7}!8U4-R1Igtznr&-yd2LOtZNZL~^SQiR zU0;kgJaeqJrk2*uP-ka$_|rE(Q~!GI)6t7&%tzloJ33>6&F!rn*~O9T|Kx=W^=}^r zFE@k%=f<yIxbx0)PSQp=2uoK3w?Djoc<;rAo9hkVxg1nnz4n#?`W&u;Cf>`2Uw*Nr z;qc~_hH#+1{%7m>SNFoFAkAm%vp@TH4gcxJ+TxYW^8K5;_Zx3!8}DEJ{)+(<=QSSG zH6Ac*v+<3wFLiyzEWh#Y*&$jwLam*hgo&x8zrJ`o`#=6y{mX&Fp{skJx&7&9LGfv! zWIO!uYQz0(!#$F)*~Tvfe)v@j_sNd!hzdZ427BNnO&k1ec-GA?yFKIH&}wBY?Cg#I zZJ0<J7iYOLxqxOsEh0`lvd7Y5?RyFI^6COI%q=`1f~(BU2b|p_9YioL6DB3zBO2rx zk1TzL(BiS64D86X*=n|e6f7Kx@z|UaK^f=~ZkJ?B9zayrlykk#PEp*J-6MvPyFwfg z6E@&j@=Y4`lob`XGR>E|rS6E@atn{jmdR@|f>_dt;x?cy`oxOzFFq&Ro1~+FYz}Ux zd!58c?LyUKDXfweNpA((aNn4O#Re*3JGT(^zp{rXiFL#qNIrBPSV?cY+nGG^hEw+P zynUP(2#94!?^KTkZ5FfkNlF~8d_wnDOoBG(RC!de({Zq(NMc(t*IN(8aWqI4>TZv* zJ|R^ITNr#x@dsa95?_XzoY^?joP}+BAXVL15ZF72T_A0$-V3<(9o6yua!E2uAX5T{ zWQx<<#T|PT_2r{b7S(C8G+r*JHVHO9R`;+7$8`-SOk7Sjxj*0(P>o$mJLm{;B);98 zl6i>&?<-Reu<N4eV~6F!ggz2uHI#6|iSkk0o|y+<sN0W@7rfF`IX_cLrO77Z7-LAP ztPRR6pR8n*4L=i-6GnS!lX#w4^uopstpPru$pf}67oM=5L^43Pdt&!=3q(ZqsQcq_ zUY!KKL=iutHE440h8flB0)o(J`-mJxSSY8yubBoWs|O@AUwN3QVoNZPTw{s?YiD-A zgKoyGz|-<T7zafM&nOC7D<cw#It%;Aj6IXL>@;zRNQtbivMny9d?W0Xp5}Dq1?`=A zIbC|%?K<+o$(P`ZDz&RD$2w2(kvCNgVEjoB!!yE5hEIf!V_^&KVMe)#5D=0jo|+1t z6c%(80Xdig3O%MM1#>#FoCZzeSDXrsK*0e=BR+#7TSqYhS?@5`6nv|C*2+s`;eaTx z!x%oO9Utj<D;4^OOW(Q}`o=Gh?)1HG*@H=>2a}D_zh}q#g-|mYa?^wd;*%7T=tm8C zYuU=@s<~-`AeO*)lFUY*c<C^Vu7EJ%(}FkR1yqmSH;?8MBqtW^I6AKjFVuoo_R63$ z*Tn;ObH1GK)xd>?p0gj-puUputl5cFfr&1&Y5TZ~5Kj=L)DN$#Ru??PVxmiW`kb+o zD)vkgX9f_g(@-<lZx4!IWJYB=>CS}}7-6N}egqJ<s7!Se?`bWHtq^_umdey%xsWZF zqR>Hh+dd-jG#12oa8(Mb&XZ{|(mW@pq_jk@D+whHPehBXnR1lfy=(UrBtkybI?v)m zs8j0k(kt#PD3x@ZfRLebWV~#X&o=xNcK1l0rvNj+W-7PN_`W@4#f3PcN6@LhSV-iS zo|fGx*^TUo{wE{VKLc~h@&g#b1*riLP<O{5FChi0-adK2>HLomGnECY1gYNSOuoQP zr7Mv^=gG2HO32fhWeamzr3!-rB9<mc<8JIpO2FuJ2qo+|NGwghYfwy^sdBmfPn=nT zefh?jPciIS9Fe~-mzKzxR(b&Ev4lgjy_MOp)N^uaO9E;nuC(MtwguV4$M(aFz-E}Y z5-m!u?mP<%5qrSZJUC%XmDWJ@6cm%B*qBdAiYA_7E6GWOM~DX$(aImfQ-!33S!Ua_ zIgdB&quolX;H9^{gF_|&q+P-^%sizzBtLl&aX0$C?LM*>QXT1Aa*uGbCHJd)fEKKA z0M;<JDM<p7XYLXhnbQdt(Ff5zp#+5qF?4B3H*+dqkR~#c$02KBc<HHy1=+YkGs|#> zDD}<B%4kH+RUD=`=M~p5+)i&oG-~=``FK(GK7HhCJ?j|tL<S`>Vh@ptKjmDipm8B5 zc`kBG<8i6h4t3uX529~T#YOcK9EhpV$Ev0D<n|D`1>bGLaOwc{01*O-HwusBqh&n8 zr>u}TZ^rDqg>F-O%ecucIVprPDFqHvHFM&4Y^^~nRII4)&3W%&hE!KTliX-Vl$H8K z?)tVfA0Z>Cj7q8m_v|rVgcBbYV{77JAlKbsOQ1k^R|DGWqe@CLpD1lMclwV3M}j4` zfh~Iq-oY}?fu2ApmSl@wQ8PWpOX|s!E9eHP&iw307W&$3?(`1g<f8Eve~k5_v#e)8 z<y69~kU)}@d2&W+t>KZ9c6Mf*B}v_>k4n6X4=3Qj#*tA0t=KKXG~e&-ufr4SxVniR zq{^MK_K{v_%r?3)3P5;P{YqtR_=AtEglY9oFc<62;&3^)3a3{hX=h+b^vmh~Je9Ia z?*J?MJ`9;Y{H9)M5~#vk8SfE0GiYs6`xAj)NWdeQD1DD>&wG)L-mrHv;Vq>&3nGqv z>ZdpkET99Q8z<P=U4R$V2C|PJJVTOcSQJx<dj4a+scw4)W^&tG@Ad{PQKQ7~WE*hD zA-R{GbJq$W#XH<<4N?(z`teQ1fxS5>ALUp^<x*upkF$6{unSI1cmpZ~$0yxYG&6#o zDm@8AxH37$0KQS?C?73YDpGvQ!iZE=R*(|AXO7A%)pD|QJbhNRqE0XxZZGN8Jo#a& z#v+U2zZjc9#yFEiY}gNS)6JktgmomDT*!Y0UjzNSgw(LSk{^8IiB-|WRM8Ft<cg++ zE2bZi9E?}S`VlZp5CCZD+s$YBrzJI6KbfODgVG%S2dfL@V++{)%A*pT%r~;`(IF_a zD0?@x0H-Lym7&UN8U}EqTTVt2CSH2kUERP|tNY3<rbwZ@L}h-0Mv2NC?En;GdZk2t zGRH@$69@!mV-*QW$%=E*rTD61%I@7RTSsdo1_eV4dm4^9*&RE!>I~=5i4I*K$t52r z8lF}8+(cPy=HOh5b48^gA_mN)M%4k6QyLEq2Ez=(xB+o&W{(ACtoq6W+hVI;NfoI+ zJEIU7F@-KeoCr}X<$pO>CW#1;cJ`5Y3WZcX;24tmJ#)%2R7?r`guP76C+nxTq2WDe zPD-iw>=WYCeT;hIo?YCk%skHiaQo{1wby>;IT60eQTWM*OrRlr<wE1{*In@%f1ez} zziNEF`wzSRjhQug{Vaz-tvJ}!+1#3n{M?n>+2RZJzi{zx`h~84&A>bF{rA?Ifuu58 zXyyuqQjsYN(r;dU>8~%5H~GS~hQ&+u*S=hL?aOTQ`D?HDHh29FvtN5}r19C=sr&WU z?_~e;VZGHb@nJoEU>{%m!QI^t8gBM9+<RfY;okM{Ub#@8sJoV_Pq127{k^d_qMtOd ztbe1r8jWSGZ7n2lnwvUOJBRhJS#6<jdi_}Ch%-6r?SrO63_>|S+!pGfo60MuU2JA@ z_#RDCs5Iv=?(HAdro-VwckDpLy*5g2ZF>u8CyJkqHxZ%20MFe9_dv43?4}#oJ27fU zEUG)5=5GCz3cekKaZD(^l#iAG2%8v^9wBB(iaDS9Q5z9Gz=twC5a^JM{$m5Vb-;gY zN}PjAXR=dRLNOxr_=zfJl75)lh8-0oakqB55Vl(MIJn2%F{am|2CL(4ZkuhMk08cd zS}jE@I;nMP%gd5~HJfwvT6{K#OwK`bE1-+H0^Vo@w+h&JPXM4h>PI`WYUi`gR357< zz?)i@k7U~3$n*%3anJXU5OO+7Dg<%cl6-({6pSg2v!Y**8{2@$_SA-UozT!CiolS; z9M69$73kcwy1Ft~bi3(|F*MdLfF?qqnsu!M&Ca6HHQyp>`(bkM{bx>zX@bGL<5jG4 zV04RAS+L?JAk!22!5Il_gAvg`hHQMV*<zim;ewPVX}gnN4NHoSN?0!(Q$#p>(qD(C z@?4Is3dX4{Yy}M6edJfkjTmWcp?XSI99;5-HtZZdAsJ^M?=$iXU$wsS%U`X2q!&_{ zcxy;vR*Wg{yEAT@GFil-PbxF!6v$_fFxR6HF`HNKt(Hr~!HsVc{slr!P2{K|%`L(< z5n5XGzji-yb;?JUFp~JP(sd-Y8r(j&ur7XH68?MO?i4W8dQ{dCxBi5B>KN~dp9G5# z{M1pX#g|wanl2;+jH_@lMS_GxmxQK6(xgIHibbAIAn1F9YG)goP>vLBg0Mshw@C%v zR}3J=C>&n-Mu`g5R46q$h6P0?WKjkiZcy9IFt1cZcA+dp#PkyHkv-bNOTp55>k?Z! z6OuU-$ui=jqEstHq=@phy0BJos>)M~LCV3{C@r76Ckf%No-n50OP|;)J<!jpvPo;O zNwO+Zo}DX!31@r6S(x=nY=YE|HW}bVC8y>bthtZ!G)S^u047H^Ylnu~+l0`<8YyXZ z;6P5#h*Zx_onG{TsVzH}sb>2!WH5_w!b;3hcwshoId#Tt29l;`1tnXMUe2MMotb<J zfTrMCl2ovX2bsy++c|bt0t(WB&RWcR2bppysW=7ZGbEZSU=@87fuS#ok|@kM*&|qW z6EpVUx^y~qXF*l19E6%$1y^>FX4wO!G=!E+BYI}APgkJ@FK)qa2)_0<wYAa{1CY4o zIzqtSN$M(sdLx9w$Ku;Os*n#85B($*B(icQR(#-|63_4;4DB=3sa*pNf`Hr_?&*e| z5dA634pOP#7irlk1gdnvff1cy2HC4A?_rFgVrNjnP;YQk-sA?{^p?(7K5!0+^CdB* z`k*P)ViZ72^IHrLbu_CU_}x_dKV168#n3<h61loJIy5m=%iiEdTRGiA$?M%!N%Gda z>=K;|(zsy!uT=Q*wO1_Dx*@=zOQ`0OWYYcx({!QpfxMR2RIhkTtKL4n6Vj&vmL=OV zu@%sS8cNCClIB#r!~X|!KnL5R3TVuzT2V4UBsbK15pFh(RJ8+;hUeQ@$%(@(yL%*l ztK6vGML?1zeUGFjE{(~?iLmoqVttFhDjl!7H{=!WGrinA)3bZUCkQH!T3h{w0uB_2 ze<k{p#<i@ie)(;P>X1&{Z^YZXd87V=Y~zQo{N=^lUo}2rZB0%tC2%XDT9f<QVfaSl z{cPiRZZ%%H6TN4~FI+zJ0$W?#JK9^@T2n!q9J62eueUzDyiZ9EpR%E|{`Q}l@#h*U zn}Y4Z&i2-thRXV@YxiFK&f)zFjW-V(Z~pLG%)OASum482_;(k6DRAZMW{u>FsnEAC zzMOsY2N%9~k^0=t@87j(!>p48pr6Aexfzqll7!6m5!DquDR+~CM8Dg2NWeig!;Ngh z+a6-M*2R0VA)_?1l)Pe55(*MJM!fmFvp8B=e&Ft}^Nm;-lTRY-B=rvMMEP%zQy#PU z^R}gthxn|V8^fZ}M^9;5Hqo6?nGB~V-l5?pzs6#kF?Qyb$_3G;UTKn~og}XnGzd$S zL%ArI^Rr|Ic7k=z#Ik5C1<>hVw|oc}qylblciWjr)r#pyY3Mp_$|Ce0bPoE3Q34e# z(jYD3GrFrd&Kb-JxsM)@M#e_YG*1U*R&fiIm_^ZwWH}8sWCe?6R;~l+aY4$%SS2=y z(6S2%Vq?R&lB2`QQ&FN??<Dt?7=py~iCZLu1s)jNkLeQRVsPqKNLJ|ioy4+BtEx$! zYYq`)8%Hl;(QJemu!~<NG{-Q8(VT=RIkk_Xc4lWx<thyH?(rjcVT(=0dgOAnB|CG` z(;jbW3?!CZ>@mIWy>c~Vv4@fovAP^WmD0*%tffS*k1fP<MLgVrUN=U%sE8Wy;*U}1 zFqR}c*Qr2_81Pm}-HW^0LJSjX6Lq&b@5L9S9XQtunssvJAzvQJR`hI@#weWOApioc zyRwRLq1jwAYE%QW7WK1kf(*@rs`sD`-Y}+cVrBBsNgpU4SwOT#jO@e)wG56#4qpfG z2%M4$A{pkxiL7Xep*k9<45<QD$Zcz82G9j)x*|+USGtg^@YU=f{YUu$Xj(g1j&!uZ z6lsb#kP-$l<_s67%L?p@*Wt<X(WGQIsT<5%8(wik0tLyeNXjJ8h8hLK3aEJ9#SPNB zE}dqlTzv+8E%0W|X0p1~ZP>uq`lsiCDUFhb@IbSJ`j>n(#D<QDfe?|LVQEN-h(uPm z@}C<1;AsFFi&CKfBwa$P=|)ltNH=wrEzF8Ox(s-*FnHEO5ldqoEWuztVG72|E|^Pq zD5}SIwnT`;`#y=VU3F!o{)}p7W!Zi4vAdo@3?;W$nzEJ2nVThYqo#_pN2W+Dk>)AA zu1DU!wLpeOneS=dG^l3|Izc%N2&BhoEmAI)?BQZSVPr!NBzG-NPJG(QK;@|17TZ+` zLe81llF3X+X;;QrpaTuweL$jGTxnC9lfw+eoc<@BbXp3h48`fxqUj5L#<NyYEcq4D zxu=+K7Bt1X$z(bsE4(tQ385B7d?FcQ!`Z;g`SqBml5W+A`YC%-yK)N)wCbYWq==b= z839g$-|A8HDhG`T^dZQjQdDDM5>Ax@9f1p8-;-07B#93(`=@+SOFhdMI>Xx{5h~fH zDgt|o9zq$@BS!EXm&lL0yL>6SOITQ*F0a*oBUm&&xQZ|EY@TjDCDejrRH79hqJAh% zwm!d@h0%#1OC28uKDDPdVTl478unKmTYw?J36+WJO_XQglGZ?-MRE{M63cAW%z2hR zh}ju^q*FRYgfZ-GgyH5hnmnVlC6!KXBfTOSPjiunVja;mnZYVed}?6-YROSmflmvA zQH0`*oaRO(IE>3}DouS#x9^JSnH?DBd-GAsE>3@pHC2xN#RrI5R*)BwfGI0(cRxXX zRK{_80FVTNR715+qPkMz6gmee4dT)kmgLHzDI&ra;6aj9OJTlIyR6@cIAEaZ);6^a z60=+h^CkUE9IUwj+nqC+WdE_Yz;G%fLIU6sD>oP+*u)zX%gap`Nec{15<fwgnApZn zH1~M{tDzqr(Q#6m3ww{mPciO*=s&M#PmF(dm(Wkj%cbGwAZvj=1c$YiRDuG085)R; z9H~M!*J=E3)Ei<9OEQb`aEZ)9y-QCP!eq=oeKqtZ=g~FO_KJ1xnUGU1=w#I5=!E7| zgB0=A+uw%Q_y=g`jXAeJ?3oj{Fi95)3!&ZE86hYi=MWS)!Yj=}5x%6CSPy3xEHU9# z(S&k8!cP%5g+;Nn;0=oYC?qSLaFW0!M1mFF@v}Qbr{0iRr;ta#S<=PbM=x?7Ex`7^ zz?qLyT2O^gM-1(lc%teQDe!clzzhIY1z(@S9^566Vo2N=n;OLl)=>cE+`bqcZuqTC zTh&{MQrE<zKQ+||NVyQE^@vt2fs3rb=|{j)7m4~Dah$NbHYpY8Wb;X5IGkoJ{6nud zV`n9W2REUdB-k)svN2iKiLedsD%YxxVjRXx=K!k;h8{@`foeA49I6mz768Lwn$MQ% zswvh|!*}}nl#WU!rpSHREEjAZR9DvG4DEs@2Q@MffK@2000QQfRl%XTU}@f%bgUkl zQY%WpeH?KY#l;skmwpJu&b&G8ZSkE*?b0$+=(<M(_)?`rDHbh(twzaKRj2CBN_5W2 zBAxyPNL3%zBcQj$do#LGjFna+jo&Q3S5aJ5@qWpME0Rw;=KUMU+^Vtg_jktCbp$k6 zCLqjfT8ChEkP2}$w!wMxxP5u*g5iN-{QNJ^d2L-)D=h}c#E=T9N;OE+_(#rR;fEU& zq!v}osk!Z}(V+~*t0rnQY#G31R~L?0!I_w1v|FIFT#O~mg_gHhOemw+EujhI8W|ji z?GpNMLr0Aj7kG<BiMlM(H0*i)u{cI?E;mt4<JOli@nSGn2H_2@Kt6X0gPox*mTs^0 zgKHeM(*&U305H2@(K7B$$+c9FTqUeX7`N_s#_=$SJ`9`XtMQlK)>V`~g*>QTMLvrl zJ+Qf;$@unMdXlCiOr^QnD*^Pq0+*A-EhC^cyru?POi~)%(i=FufBn6evhVHO{EPod z*5lRBUA+A*atWV!@9(cw^A|xfC~cIo+EYs-*YE88Q8fNS{TpkUm+tL;SK5HT`_d0z zyj1@}ciju!RKouB1u5VCOS8tg^4;kgr{%jZT|8*G)%o+cfBc#BNA*`9&hC@e|M}Yc zw{9N3ACP|U@U8j}FW0|ICwBe2?D&)QcUNzA{kB==FF#)c^=Tc@l-_Pz{QdzYl)f1x z2`vt-!OA^RAl<Y_%~eL0K_a;Ucb#6Jq!U>dUwMEGCdPP{%iNHMl8wQn*4aX;mn1)W zko7H~@h)|)Y?kZNG=;33`m3?V8~>~-iRU&p&6u=al6ynjYNwQ3MK>20U<5n+h}K8F zn=o<r`m0tz*TG^CbJF)Aym0O^Td$nKB%3(~Y$B}9CI945CA<LZVI^QX5yi11Rl#Bd zOqSev7^crbJ`$I#aFI?r`^5&tON366JSpTYhLXGRTZ!o4VNpfc7PozJxmrw$n3Xmw zkDky3P?@LQZkd=}s`DvRktZ2>xF^`tWF%V76uc#jBUMWtg^PT~l0pSi<Zw2av#K-` z8$6Mb%jk2<MoU0SF0X?O0Tz!czb;j}u3j0nJnu#qG>K16g!7SG(QA1KzYG}KW>Ykc zlu@hXJ!25EV8G(m)!nW8^W3Mqe#@|2ZDO$1XSvg6{kFp|kg&@(e(>`5?@kO|ZEU=> zS9fU-o_%!b?#P#;KfwZb-dj0qly7Qn?FhAWwzY)X^+I&$o6a0Po3wUkxa0(nIutAB zhRDEs#dRVm$~)*iKKfmJ83~gF<9OoIsAMQ(eucm%?${C3)WQwtGzV=~K9<_)VBG8L zk{=vcc_i#HYsr>+8&w)M*2lOtOnZYc$M8e?21NN+;rmBIX4*8wv(@{vwC|bnuD_0f zo4o^=7JJT;c2sf6=@^qsd$^kt3i4bWP;>cPyo{2$8%|S=@t1i@cnMmxlR^$MnV?5b z9DKRhDo@=lX$ls${SnBKEpmBFk**j}APD{gQm3HM$?f0$C3is6bLU2e32>#mN}3Ue z=)}Q+DyWzd?0O^pcC6R!0_ZM)fKJd%Eu6mRkODM_wF>)iru5qlJ+YINR=fW{^oJ>^ z!zz&tex02jO1vy5b{0jFU-vR`(*oH?s*-GqyjPDa!}ZS^R+6PGe=X_@0d8U;@|4}} z^bztZ`7FRO%`JL^TkLFNnPzm~S{ufYmh2wsO@EgEa6hPZvXl~1Z0qR-Oe~{lw2~F$ z)*`N>-?#lVcs_J0!iHglUyyftU_qMZ7QsH>NQMxv)MyedAg|&?sxLx;&_3EDHz<6K zbD1BKLP<Df$1M3uv(b9-l0kyY8h6pxl5+NSbfxAt;7$qpS0-@mIpEpxxrI<1)hW8G z{??5kleWwl^;k37eIh>(aZdi^RNB#LJUY<BfmL*PQ_Jvjijxz`kj8entHI`#v1yoJ z^dK@$QUV1co*gM(K~1%3LKEXvF@ZPvL4-~H7H2ac+3wT(5GptNQw#Ds?k5ygj1@F? zeukr-Az4g1)YTl2ss|#C;)U(EdMOtZEn@rRcUojQ%mcP0yubm0OO82?V@{K#1j>+H z4yyu#>uk)KLX43Xssoqi&{agF5Hd(MUov+RKwxmr_c*+*L}I26^>Jb7pkIPNyKgV` z{^uKi%wl!@Sr=k1x&4BZr;@lz^PrIdn2X<M;c_DXaa2k9V}H#@g$`F(Z?K5T&L>&I zFSW;{wy3_OY~0sJD=WHMzri}7*(B6=3H{@nNEW|gL?u9e$mV3!>@>>n5`a-8S#7^C z4Ht3?w7`ot^fH8#BZ4Cu_+Tr}^`mI|tO}B=gxkj!ZLTy^2<>}I<Yf>-33A}~{f7`$ zmNxEDRV+D?)JjaA0vM@6sMCTBph9FFQ#4Q{s*uy*Ael<aolA26CqNsvv>InuiGk$& zPKsZJ+J#4WWeusun1VbF5VSH<n37eEHBg|~RvT}nDl;T`Nc9ob1?M9tfGsqZG|1sc zb$Z!F$`hLpW#c}6VMZ=+Id`;S(^w_9a2#Mg|IR_*jNH-q;KDC_-CWV|Wu!Ez+S7NI z5_0iM)_EA_+A6wLx)5WOs>LZ~BVZ5!_;ssUL}tc`k*zbAz5C=BLO4GTl_a6karC^- zLi*gi<iYeRyL9kIH@shrNJ`i!=~|-&9z?GhED0WH&gZJ?{(>M115snE)QDt%kbL>d z7pOzZMa0h<SLpneO1*t~b`kMUt`jS~{QVaj_+glzUvId130MB+EA{VZ8#7na*KWVd z5>H>?>}=}jY;E&z0^WS(z548X-+tvUZupx9pR=jXINOxlk3+9q9%}qW%Al`jU%2<` zi;cf=h4=qKJ_E>?`TG*@|CGN?-hYn&zsBFM^Eb=iE&i_ZC%;q|;qPzw<CbcDC;xxU z-wJ<n1@blhtA8u|$8Wtl@YCxJTvA<c_~2C#^x;eWm+y4FZ;rQUoy8|D9j#66?Jcd@ V%<#)!yE5^emnb`DUj4zh{x7cl*46+3 diff --git a/func/collocation/collocation.py b/func/collocation/collocation.py index c003be7..185024a 100644 --- a/func/collocation/collocation.py +++ b/func/collocation/collocation.py @@ -7,11 +7,25 @@ from nltk.metrics import TrigramAssocMeasures from nltk.collocations import BigramCollocationFinder from nltk.metrics import BigramAssocMeasures from db.db_config import get_db - +from googletrans import Translator +import asyncio def run_collocation_on_text(page): - datasetid = page.split('><p>')[0].replace('<div id=', '').replace('"', '').strip() - print('dataset id in run_collocation_on_text: ',datasetid) + translator = Translator() + datasetid = page.split('__')[0] + inputstring = page.split('__')[1] + #detectlanguage = asyncio.run(translator.detect(inputstring)) + #print('detected language: ',detectlanguage) + inputstringCh = asyncio.run(translator.translate(inputstring, src='en', dest='zh-cn')).text.strip() + #inputstringCh = translatech(inputstring).text.strip() + print('inputstring original: ',inputstring) + print('inputstringCh translated: ',inputstringCh) + print('-----------------------------') + + #datasetid = page.split('><p>')[0].replace('<div id=', '').replace('"', '').strip() + #print('dataset id in run_collocation_on_text: ',datasetid) + + collocations = [] nlp = spacy.load('zh_core_web_sm') @@ -61,8 +75,11 @@ def run_collocation_on_text(page): #allscores = scoredbigrams+scoretrigrams for item in scoredbigrams: itemstr = " ".join(i for i in item[0]) - if '部' in itemstr: - itemstrnew = itemstr.replace('部','').strip().replace(' ','') + #if '部' in itemstr: + if inputstringCh in itemstr: + #itemstrnew = itemstr.replace('部','').strip().replace(' ','') + itemstrnew = itemstr.replace(inputstringCh, '').strip().replace(' ', '') + #print('itemstrnew: ',itemstrnew) #translation = translate(itemstr.replace('部','').strip()).text.lower() #print(translation) #print('--------------') diff --git a/func/concordance/concordance.py b/func/concordance/concordance.py index 71e7c12..166cfb5 100644 --- a/func/concordance/concordance.py +++ b/func/concordance/concordance.py @@ -5,10 +5,12 @@ from spacy.matcher import PhraseMatcher from db.db_config import get_db from nltk.collocations import BigramCollocationFinder from nltk.metrics import BigramAssocMeasures +from googletrans import Translator +import asyncio -def collocations(datasetid): +def collocations(datasetid,word): collocations = [] nlp = spacy.load('zh_core_web_sm') @@ -49,7 +51,8 @@ def collocations(datasetid): # allscores = scoredbigrams+scoretrigrams for item in scoredbigrams: itemstr = " ".join(i for i in item[0]) - if '部' in itemstr: + #if '部' in itemstr: + if word in itemstr: itemstrnew = itemstr #translation = translate(itemstr).text.lower() # print(translation) @@ -66,8 +69,14 @@ def collocations(datasetid): def run_concordance_on_text(page): - datasetid = page.replace("<p>Collocations for the word '部' (department) for ",'').replace('</p>','').strip() + translator = Translator() + datasetid = page.replace('<p>Collocations for','').replace('for dataset','').replace('</p>','').split()[1].strip() + word = page.replace('<p>Collocations for','').replace('for dataset','').split()[0].strip() + + wordch = asyncio.run(translator.translate(word, src='en', dest='zh-cn')).text.strip() + print('datasetid inside run_concordance_on_text: ',datasetid) + print('word inside run_concordance_on_text: ', wordch) #page = page+'部' nlp = spacy.load('zh_core_web_sm') conn, cursor = get_db() @@ -81,7 +90,7 @@ def run_concordance_on_text(page): data.append([docid, content]) concordances = [] - terms = collocations(datasetid) + terms = collocations(datasetid,wordch) #terms = [page] for i in range(0, len(data)): diff --git a/func/translation/translation.py b/func/translation/translation.py index deeca7a..bf74b33 100644 --- a/func/translation/translation.py +++ b/func/translation/translation.py @@ -2,6 +2,7 @@ import html_to_json from shared.translate import translate + # Translate text def run_translation_on_text(page): #print('page from translation.py: ',page) -- GitLab