"""Extract text from LexisNexis-style PDF exports and load them into the news DB.

Pipeline: Apache Tika converts each PDF under ``files/`` to plain text in
``out/``; ``parse_document`` parses the LexisNexis layout (title, repeated
"Page X of Y" headers, header/source/date block, copyright, metadata, body,
classification); ``savetodb`` inserts the cleaned record into the ``news``
table via the project connection from ``db.db_config``.
"""

# extracting text from PDFs
from tika import parser
import os
from pathlib import Path

# cleaning documents
import pandas as pd              # NOTE(review): unused here — kept for compatibility
from os import listdir           # NOTE(review): unused — os.listdir is used instead
from datetime import datetime
import click                     # NOTE(review): unused — no CLI options defined yet

# db connection
from db.db_config import get_db

# Module-level connection shared by savetodb()/main(); closed once in main().
conn, cursor = get_db()


def skip_empty_lines(f):
    """Return the next non-blank line of *f*, stripped, or None at EOF."""
    for raw in f:
        if raw.strip():
            return raw.strip()
    return None  # explicit: file exhausted


def skip_irrelevant(f):
    """Return the next meaningful line, skipping blanks and "Page X of Y" lines."""
    line = skip_empty_lines(f)
    # Guard: skip_empty_lines returns None at EOF; don't call .startswith on it.
    while line is not None and line.startswith("Page"):
        line = skip_empty_lines(f)
    return line


def read_skipping_header(f, top_title=None):
    """Yield stripped lines of *f*, dropping "Page X of Y" page headers.

    After each page marker the run of lines up to the next empty line is
    skipped.  If *top_title* is truthy, the block after that (the repeated
    document title) is skipped as well, again up to the next empty line.
    NOTE(review): the title is assumed to follow the marker — it is never
    actually compared against *top_title*; confirm against real exports.
    Blank lines elsewhere are yielded as "" so the body keeps its spacing.
    """
    l = next(f, None)
    while l is not None:
        l = l.strip()

        if l.startswith("Page "):
            # Consume up to and including the empty line after the marker.
            l = next(f, None)
            while l and l.strip():
                l = next(f, None)

            if top_title:
                # Also consume the repeated-title block and its blank terminator.
                l = next(f, None)
                while l and l.strip():
                    l = next(f, None)

            continue

        yield l
        l = next(f, None)


def parse_document(filename):
    """Parse one LexisNexis-style text export into a field dictionary.

    Expected layout: title; "Page X of Y" headers (skipped); a multi-line
    header, a one-line source and a one-line publication date; copyright
    lines up to "Length: "; key/value metadata up to "Body"; the article
    body up to "Classification"; classification key/values up to
    "End of Document".

    Returns a dict always containing "filename"; remaining keys are
    best-effort — any parse failure is printed and a partial dict returned.
    """
    res = {
        "filename": filename
    }

    try:
        with open(filename, 'r') as f:
            res["initial_title"] = skip_irrelevant(f)

            # From here on, read through the header-skipping generator.
            f = read_skipping_header(f, res["initial_title"])

            # There is always a header (possibly several lines), a one-line
            # source and a one-line publication date.  The date is the only
            # reliable end marker, so accumulate lines until one parses as a
            # date, then split what came before into header + source.
            res["pub_date"] = None
            pending = []
            while res["pub_date"] is None:
                l = skip_empty_lines(f)

                try:
                    # e.g. "June 6, 2024 Thursday" — some exports append a
                    # full time; only the first three tokens (month, day,
                    # year) are parsed and the rest discarded.
                    res["pub_date"] = datetime.strptime(
                        ' '.join(l.split(' ')[:3]),
                        "%B %d, %Y"
                    ).strftime("%Y-%m-%d")

                    # Date found => everything accumulated is header + source.
                    assert len(pending) >= 2, \
                        "We have not found enough lines for both header and source"
                    res["header"] = ''.join(pending[:-1])
                    res["source"] = pending[-1]
                except ValueError:
                    # Not a date yet — keep collecting header/source lines.
                    pending.append(l)

            # Copyright block runs until the "Length: " marker line.
            l = skip_empty_lines(f)
            res["copyright"] = ""
            while not l.startswith("Length: "):
                res["copyright"] += l + '\n'
                l = skip_empty_lines(f)

            # Key/value metadata (plus free text) until the "Body" marker.
            # The "Length: ..." line itself is recorded here as res["Length"].
            res["other_top"] = ""
            while l != "Body":
                if l:
                    if ": " in l:
                        # maxsplit=1 so values containing ": " don't raise.
                        k, v = l.split(": ", 1)
                        res[k] = v
                    else:
                        res["other_top"] += l + '\n'

                l = skip_empty_lines(f)

            # Article body runs until the "Classification" marker; blank
            # lines are preserved (the generator yields them as "").
            body = []
            for l in f:
                if l == "Classification":
                    break
                body.append(l)

            res["body"] = "\n".join(body)

            # Classification key/values until "End of Document".
            res["other_classification"] = ""
            l = skip_empty_lines(f)
            while l != "End of Document":
                if l:
                    if ": " in l:
                        k, v = l.split(": ", 1)

                        if k == "Load-Date":
                            # Normalise "June 6, 2024" -> "2024-06-06".
                            v = datetime.strptime(
                                v,
                                "%B %d, %Y"
                            ).strftime("%Y-%m-%d")

                        res[k] = v
                    else:
                        res["other_classification"] += l + '\n'

                l = skip_empty_lines(f)
    except Exception as e:
        # Best-effort parser: report and return whatever was parsed so far.
        print(e)
        print(filename)

    return res


def getinputfiles(dir):
    """Return the names of the entries in *dir* (non-recursive)."""
    return os.listdir(dir)


def savetotxt(file, content):
    """Write *content* to *file*, overwriting; the handle is closed even on error."""
    with open(file, "w") as out:
        out.write(content)


def simpleextraction(f):
    """Convert files/<f> to text with Tika, parse it, return a DB-ready record.

    Side effects: writes the raw Tika text to out/<stem>.txt.
    Raises KeyError if parsing failed to find pub_date/source/body/Load-Date.
    """
    outdir = 'out/'
    os.makedirs(outdir, exist_ok=True)  # out/ may not exist on a fresh checkout

    src = 'files/' + f
    raw = parser.from_file(src)
    location = os.path.dirname(os.path.abspath(src))
    rawcontent = raw['content']

    title = f.split(".")[0]  # file stem doubles as the record title
    txtfile = outdir + title + '.txt'
    savetotxt(txtfile, rawcontent)

    parsedtxt = parse_document(txtfile)

    return {
        "title": title,
        "location": location,
        "pubyear": parsedtxt['pub_date'].split('-')[0].strip(),
        "pubdate": parsedtxt['pub_date'],
        "websource": parsedtxt['source'],
        "loaddate": parsedtxt['Load-Date'],
        "cleanedcontent": parsedtxt['body'],
        "rawcontent": rawcontent
    }


def savetodb(res):
    """Insert one parsed record into the ``news`` table.

    Uses a parameterized query instead of string concatenation: article
    titles/bodies are untrusted text and routinely contain quotes, which
    both broke the old statement and opened it to SQL injection.
    NOTE(review): '?' is the sqlite3 placeholder — switch to '%s' if
    db_config returns a psycopg/MySQL connection.
    """
    cursor.execute(
        "INSERT INTO news VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
        (
            str(res['id']),
            res['title'],
            res['location'],
            res['pubyear'],
            res['pubdate'],
            res['websource'],
            res['loaddate'],
            res['cleanedcontent'],
        ),
    )
    conn.commit()
    # NOTE: the connection is deliberately NOT closed here — closing after
    # the first insert made every later file in main()'s loop fail.


def main():
    """Process every file under files/ and load the parsed records into the DB."""
    inputdir = 'files'

    # macOS Finder drops .DS_Store into the input dir; remove it before listing.
    dsstore = Path("files/.DS_Store")
    if dsstore.exists():
        os.remove("files/.DS_Store")

    for counter, f in enumerate(getinputfiles(inputdir)):
        res = simpleextraction(f)
        res['id'] = counter
        savetodb(res)

    # Close once, after all files are saved (was previously inside savetodb,
    # which silently limited the run to a single document).
    conn.close()


if __name__ == '__main__':
    main()