diff --git a/api/Dockerfile b/api/Dockerfile index fc22ef5..cbf55eb 100644 --- a/api/Dockerfile +++ b/api/Dockerfile @@ -1,8 +1,8 @@ FROM php:8-fpm -RUN apt-get update -y && apt-get install -y git +RUN apt-get update -y && apt-get install -y git libzip-dev zip -RUN docker-php-ext-install pdo pdo_mysql +RUN docker-php-ext-install pdo pdo_mysql zip COPY --from=composer /usr/bin/composer /usr/bin/composer diff --git a/api/common/Service/PdfHandler.php b/api/common/Service/PdfHandler.php index 58f7e39..4f9e1ab 100644 --- a/api/common/Service/PdfHandler.php +++ b/api/common/Service/PdfHandler.php @@ -24,7 +24,7 @@ class PdfHandler { $output []= ['filename' => $file->getBasename()]; } $response = $this->client->post($this->url, ['json' => ['files' => $output]]); - !d(json_decode($response->getBody())); + $output = json_decode($response->getBody()); return $output; } } \ No newline at end of file diff --git a/api/public/uploads/pdfs/BICE-CC-2021-06.pdf b/api/public/uploads/pdfs/BICE-CC-2021-06.pdf deleted file mode 100644 index 3542b89..0000000 Binary files a/api/public/uploads/pdfs/BICE-CC-2021-06.pdf and /dev/null differ diff --git a/api/public/uploads/pdfs/BICE-CC-2021-07.pdf b/api/public/uploads/pdfs/BICE-CC-2021-07.pdf deleted file mode 100644 index 5864d8b..0000000 Binary files a/api/public/uploads/pdfs/BICE-CC-2021-07.pdf and /dev/null differ diff --git a/api/public/uploads/pdfs/Scotiabank-CC-2021-07.pdf b/api/public/uploads/pdfs/Scotiabank-CC-2021-07.pdf deleted file mode 100644 index 42fbeee..0000000 Binary files a/api/public/uploads/pdfs/Scotiabank-CC-2021-07.pdf and /dev/null differ diff --git a/api/public/uploads/pdfs/Scotiabank-CC-2021-08.pdf b/api/public/uploads/pdfs/Scotiabank-CC-2021-08.pdf deleted file mode 100644 index ac7de93..0000000 Binary files a/api/public/uploads/pdfs/Scotiabank-CC-2021-08.pdf and /dev/null differ diff --git a/api/public/uploads/pdfs/Scotiabank-CC-2021-09.pdf b/api/public/uploads/pdfs/Scotiabank-CC-2021-09.pdf deleted file mode 
100644 index b2ba2fb..0000000 Binary files a/api/public/uploads/pdfs/Scotiabank-CC-2021-09.pdf and /dev/null differ diff --git a/python/.gitignore b/python/.gitignore new file mode 100644 index 0000000..b42097e --- /dev/null +++ b/python/.gitignore @@ -0,0 +1 @@ +**/__pycache__/ \ No newline at end of file diff --git a/python/Dockerfile b/python/Dockerfile index a65d664..4f0b04e 100644 --- a/python/Dockerfile +++ b/python/Dockerfile @@ -1,8 +1,8 @@ FROM python -RUN apt-get update -y && apt-get install -y default-jre +RUN apt-get update -y && apt-get install -y ghostscript python3-tk libgl-dev -RUN pip install flask tabula-py pyyaml pypdf4 gunicorn +RUN pip install flask pyyaml pypdf4 gunicorn camelot-py[cv] WORKDIR /app diff --git a/python/data/Scotiabank-CC-2021-10.pdf b/python/data/Scotiabank-CC-2021-10.pdf new file mode 100644 index 0000000..8fef2f3 Binary files /dev/null and b/python/data/Scotiabank-CC-2021-10.pdf differ diff --git a/python/src/app.py b/python/src/app.py index 3f03046..707de9b 100644 --- a/python/src/app.py +++ b/python/src/app.py @@ -5,6 +5,7 @@ from flask import Flask, request import contabilidad.pdf as pdf import contabilidad.passwords as passwords import contabilidad.log as log +import contabilidad.text_handler as th app = Flask(__name__) @@ -18,17 +19,21 @@ def pdf_parse(): data['files'] = [data['files']] password_file = '/app/config/.passwords.yml' pwds = passwords.get_passwords(password_file) - texts = [] + output = [] for file in data['files']: filename = os.path.realpath(os.path.join('/app/data', file['filename'])) + texts = [] for p in pwds: + obj = pdf.get_data(filename, p) + print(obj) obj = pdf.get_text(filename, p) if obj is None: continue - print(obj) - texts.append(json.dumps(obj)) - return json.dumps(texts) + text = th.text_cleanup(obj, file['filename']) + texts.append(text) + output.append({'filename': file['filename'], 'text': texts}) + return json.dumps(output) if __name__ == '__main__': - app.run(host='0.0.0.0') + 
def get_pdf(file, password=''):
    """Open *file* as a PDF and unlock it with *password* when it is encrypted.

    Args:
        file: Binary file object positioned at the start of the PDF.
        password: Candidate password. Any non-string value is converted with
            ``str``; ``None`` is treated as the empty password.

    Returns:
        The ``PyPDF4.PdfFileReader`` for the file, or ``None`` when the file
        is encrypted and the password does not unlock it.
    """
    reader = PyPDF4.PdfFileReader(file)
    if reader.getIsEncrypted():
        # Always attempt decryption on an encrypted file, including with the
        # empty password, so callers get ``None`` (their "try the next
        # password" signal) instead of a reader that is still locked.
        secret = '' if password is None else str(password)
        # decrypt() is checked against 0, the value the original code treats
        # as "password rejected".
        if reader.decrypt(secret) == 0:
            return None
    return reader
def text_cleanup(text, filename: str = None):
    """Route extracted PDF text to the clean-up routine for its bank.

    Args:
        text: A single string, or a list of strings (handled recursively, so
            the result has the same nesting as the input).
        filename: Name of the source PDF. The bank is recognised by a
            case-insensitive substring match on this name.

    Returns:
        The cleaned text, or *text* unchanged when *filename* is ``None`` or
        names no known bank.
    """
    if isinstance(text, list):
        return [text_cleanup(page, filename=filename) for page in text]
    if filename is None:
        return text
    lowered = filename.lower()
    if 'bice' in lowered:
        return bice(text)
    if 'scotiabank' in lowered:
        return scotiabank(text)
    return text


def bice(text):
    """Print the triple-newline separated chunks of *text* and return *text*.

    NOTE(review): the split result is only printed; the text itself is
    returned unmodified.
    """
    lines = text.split("\n\n\n")
    print(lines)
    return text


def scotiabank(text):
    """Print the sections parsed from a Scotiabank statement and return *text*.

    The text is split on newlines and cut into sections by fixed marker
    strings. A section whose start marker is absent contributes no rows, so
    text without the markers no longer raises ``ValueError``.

    NOTE(review): the parsed rows are only printed; *text* is returned
    unmodified. Confirm whether the rows are meant to be returned.
    """
    words = text.split("\n")
    output = [words[0]]
    output += extract_from_to(words, 'No. CTA.', end='VENCIMIENTO LINEA DE CREDITO', line_length=3)
    output += extract_from_to(words, 'VENCIMIENTO LINEA DE CREDITO',
                              end='NOMBRE EJECUTIVO: LILIAN AVILA MANRIQUEZ', line_length=2)
    output += extract_from_to(words, 'NOMBRE EJECUTIVO: LILIAN AVILA MANRIQUEZ', end='SALDO ANTERIOR',
                              line_length=1)
    output += extract_from_to(words, 'SALDO ANTERIOR', end='FECHA', line_length=4)
    output += extract_data(words, 'FECHA', end='ACTUALICE SIEMPRE ANTECEDENTES LEGALES, ', line_length=6,
                           merge_list=[['DOCTO', 'No.'], ['SALDO', 'DIARIO']])
    for row in output:
        print(row)
    return text


def _slice_between(word_list, start, end=None):
    """Return the words from *start* up to (not including) *end*.

    *end* is searched from the position of *start* onwards. A missing *start*
    yields ``[]``; a missing (or ``None``) *end* yields everything from
    *start* to the end of the list.
    """
    try:
        first = word_list.index(start)
    except ValueError:
        return []
    if end is None:
        return word_list[first:]
    try:
        last = word_list.index(end, first)
    except ValueError:
        return word_list[first:]
    return word_list[first:last]


def extract_from_to(word_list, start, line_length, end: str = None, merge_list=None):
    """Cut the words between two markers into rows of *line_length* words.

    Args:
        word_list: List of words to search.
        start: Marker that opens the section (included in the result).
        line_length: Number of words per row.
        end: Marker that closes the section (excluded). It is searched from
            *start* onwards; when ``None`` or absent the section runs to the
            end of *word_list*.
        merge_list: Optional word groups to merge, see ``merge_words``.

    Returns:
        A list of rows (lists of words); ``[]`` when *start* is absent.
    """
    return extract_by_line(_slice_between(word_list, start, end), line_length, merge_list)


def extract_by_line(word_list, line_length, merge_list=None):
    """Split *word_list* into consecutive rows of *line_length* words.

    The last row may be shorter. Empty input yields ``[]``.

    Raises:
        ValueError: If *line_length* is smaller than 1.
    """
    if line_length < 1:
        raise ValueError('line_length must be a positive integer')
    if merge_list is not None:
        word_list = merge_words(word_list, merge_list)
    return [word_list[i:i + line_length] for i in range(0, len(word_list), line_length)]


def merge_words(word_list, merge_list):
    """Join each word group of *merge_list* into one newline-separated word.

    For every group, the first position where the whole group appears as
    consecutive words is replaced by ``"\\n".join(group)``. Groups that do not
    occur are skipped. The input list is not modified.

    Returns:
        A new list with the merges applied.
    """
    merged = list(word_list)
    for group in merge_list:
        target = list(group)
        size = len(target)
        if size == 0:
            continue
        for i in range(len(merged) - size + 1):
            # Require the full sequence to match, not just its first word.
            if merged[i:i + size] == target:
                merged[i:i + size] = ["\n".join(target)]
                break
    return merged


def _pad_row(row, line_length):
    """Return *row* extended with empty strings up to *line_length* cells."""
    return row + [''] * (line_length - len(row))


def extract_data(word_list, start, line_length, end=None, merge_list=None, date_sep='/'):
    """Cut a table section into rows, starting a new row at each date word.

    Words between *start* and *end* are collected into rows of *line_length*
    cells. When a word containing *date_sep* arrives while the current row is
    only partly filled, that row is padded with ``''`` and closed, and the
    date word opens the next row.

    Args:
        word_list: List of words to search.
        start: Marker that opens the section (included in the result).
        line_length: Number of cells per row.
        end: Marker that closes the section (excluded); ``None`` or absent
            means the section runs to the end of *word_list*.
        merge_list: Optional word groups to merge, see ``merge_words``.
        date_sep: Substring that identifies a date word.

    Returns:
        A list of rows, each exactly *line_length* cells long; ``[]`` when
        *start* is absent.

    Raises:
        ValueError: If *line_length* is smaller than 1.
    """
    if line_length < 1:
        raise ValueError('line_length must be a positive integer')
    word_list = _slice_between(word_list, start, end)
    if merge_list is not None:
        word_list = merge_words(word_list, merge_list)
    output = []
    line = []
    for w in word_list:
        if len(line) == line_length:
            output.append(line)
            line = []
        elif line and date_sep in w:
            # A date starts a new row: close the short row at full width.
            output.append(_pad_row(line, line_length))
            line = []
        line.append(w)
    if line:
        # Flush the trailing row, which was previously dropped.
        output.append(_pad_row(line, line_length))
    return output