From 61448a2521fccb3e11da17f2da9bf2103ac689d2 Mon Sep 17 00:00:00 2001 From: Aldarien Date: Tue, 2 Nov 2021 22:12:25 -0300 Subject: [PATCH] Python working --- python/Dockerfile | 7 ++----- python/src/app.py | 33 +++++++++++++++++++++++++-------- python/src/contabilidad/pdf.py | 34 ++++++++++++++++++++++++++-------- python/src/main.py | 8 +++++--- 4 files changed, 58 insertions(+), 24 deletions(-) diff --git a/python/Dockerfile b/python/Dockerfile index 4f0b04e..1eedf44 100644 --- a/python/Dockerfile +++ b/python/Dockerfile @@ -2,17 +2,14 @@ FROM python RUN apt-get update -y && apt-get install -y ghostscript python3-tk libgl-dev -RUN pip install flask pyyaml pypdf4 gunicorn camelot-py[cv] +RUN pip install flask pyyaml pypdf4 gunicorn camelot-py[cv] pikepdf WORKDIR /app COPY ./src/ /app/src/ -#ENTRYPOINT ["/bin/bash"] - EXPOSE 5000 WORKDIR /app/src -CMD ["python", "app.py"] -#CMD ["gunicorn", "-b 0.0.0.0:5000", "app:app"] +CMD ["gunicorn", "-b 0.0.0.0:5000", "app:app"] diff --git a/python/src/app.py b/python/src/app.py index 707de9b..5722eb2 100644 --- a/python/src/app.py +++ b/python/src/app.py @@ -1,5 +1,8 @@ +import io import json import os +import sys + from flask import Flask, request import contabilidad.pdf as pdf @@ -22,16 +25,30 @@ def pdf_parse(): output = [] for file in data['files']: filename = os.path.realpath(os.path.join('/app/data', file['filename'])) - texts = [] + t = file['filename'].split('.') + temp = os.path.realpath(os.path.join('/app/data', t[0] + '-temp.pdf')) for p in pwds: - obj = pdf.get_data(filename, p) - print(obj) - obj = pdf.get_text(filename, p) - if obj is None: + if not pdf.check_password(filename, p): continue - text = th.text_cleanup(obj, file['filename']) - texts.append(text) - output.append({'filename': file['filename'], 'text': texts}) + pdf.remove_encryption(filename, p, temp) + obj = pdf.get_data(temp) + outputs = [] + for o in obj: + out = json.loads(o.df.to_json(orient='records')) + if out[0]['0'] == 'FECHA': + for i, line in enumerate(out): + if 'FECHA' in line['0'] or 'ACTUALICE' in line['0']: + continue + if line['0'] == '': + spl = line['1'].split(' ') + else: + spl = line['0'].split(' ') + line['0'] = ' '.join(spl[:3]) + line['1'] = ' '.join(spl[3:]) + out[i] = line + outputs.append(out) + os.remove(temp) + output.append({'filename': file['filename'], 'text': outputs}) return json.dumps(output) diff --git a/python/src/contabilidad/pdf.py b/python/src/contabilidad/pdf.py index 5e8ecc7..15bf46a 100644 --- a/python/src/contabilidad/pdf.py +++ b/python/src/contabilidad/pdf.py @@ -1,7 +1,30 @@ -import sys - import camelot import PyPDF4 +import pikepdf + + +def is_encrypted(filename): + with open(filename, 'rb') as file: + reader = PyPDF4.PdfFileReader(file) + return reader.getIsEncrypted() + + +def check_password(filename, password): + if not is_encrypted(filename): + return True + with open(filename, 'rb') as file: + reader = PyPDF4.PdfFileReader(file) + status = reader.decrypt(str(password)) + if status == 0: + return False + return reader.getIsEncrypted() + + +def remove_encryption(filename, password, new_name): + pdf = pikepdf.open(filename, password=str(password)) + if '.pdf' not in new_name: + new_name += '.pdf' + pdf.save(new_name) def get_pdf(file, password=''): @@ -24,10 +47,5 @@ def get_text(filename, password=''): return texts -def get_data(filename, password=''): - with open(filename, 'rb') as file: - reader = PyPDF4.PdfFileReader(file) - print(reader, file=sys.stderr) - if password != '' and reader.getIsEncrypted(): - return camelot.read_pdf(filename, password=str(password), pages='all', output_format='json') +def get_data(filename): return camelot.read_pdf(filename, pages='all', output_format='json') diff --git a/python/src/main.py b/python/src/main.py index 4bc2571..229b132 100644 --- a/python/src/main.py +++ b/python/src/main.py @@ -7,16 +7,18 @@ import contabilidad.text_handler as th def main(args): filename = os.path.realpath(os.path.join(os.path.dirname(__file__), '..', 'data', args.filename)) - obj = pdf.get_data(filename, args.password) - print(obj) + temp = os.path.realpath(os.path.join(os.path.dirname(__file__), '..', 'data', args.temp_filename)) + pdf.remove_encryption(filename, args.password, temp) + obj = pdf.get_data(temp) obj = pdf.get_text(filename, args.password) text = th.text_cleanup(obj, filename=str(args.filename)) - print(text) + os.remove(temp) if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('-f', '--filename', type=str) parser.add_argument('-p', '--password', type=str, default='') + parser.add_argument('-t', '--temp_filename', type=str) _args = parser.parse_args() main(_args)