Camelot reading

This commit is contained in:
2021-11-02 15:37:36 -03:00
parent 5ee267568a
commit 0c44554375
18 changed files with 121 additions and 22 deletions

View File

@ -1,8 +1,8 @@
FROM php:8-fpm FROM php:8-fpm
RUN apt-get update -y && apt-get install -y git RUN apt-get update -y && apt-get install -y git libzip-dev zip
RUN docker-php-ext-install pdo pdo_mysql RUN docker-php-ext-install pdo pdo_mysql zip
COPY --from=composer /usr/bin/composer /usr/bin/composer COPY --from=composer /usr/bin/composer /usr/bin/composer

View File

@ -24,7 +24,7 @@ class PdfHandler {
$output []= ['filename' => $file->getBasename()]; $output []= ['filename' => $file->getBasename()];
} }
$response = $this->client->post($this->url, ['json' => ['files' => $output]]); $response = $this->client->post($this->url, ['json' => ['files' => $output]]);
!d(json_decode($response->getBody())); $output = json_decode($response->getBody());
return $output; return $output;
} }
} }

1
python/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
**/__pycache__/

View File

@ -1,8 +1,8 @@
FROM python FROM python
RUN apt-get update -y && apt-get install -y default-jre RUN apt-get update -y && apt-get install -y ghostscript python3-tk libgl-dev
RUN pip install flask tabula-py pyyaml pypdf4 gunicorn RUN pip install flask pyyaml pypdf4 gunicorn camelot-py[cv]
WORKDIR /app WORKDIR /app

Binary file not shown.

View File

@ -5,6 +5,7 @@ from flask import Flask, request
import contabilidad.pdf as pdf import contabilidad.pdf as pdf
import contabilidad.passwords as passwords import contabilidad.passwords as passwords
import contabilidad.log as log import contabilidad.log as log
import contabilidad.text_handler as th
app = Flask(__name__) app = Flask(__name__)
@ -18,17 +19,21 @@ def pdf_parse():
data['files'] = [data['files']] data['files'] = [data['files']]
password_file = '/app/config/.passwords.yml' password_file = '/app/config/.passwords.yml'
pwds = passwords.get_passwords(password_file) pwds = passwords.get_passwords(password_file)
texts = [] output = []
for file in data['files']: for file in data['files']:
filename = os.path.realpath(os.path.join('/app/data', file['filename'])) filename = os.path.realpath(os.path.join('/app/data', file['filename']))
texts = []
for p in pwds: for p in pwds:
obj = pdf.get_data(filename, p)
print(obj)
obj = pdf.get_text(filename, p) obj = pdf.get_text(filename, p)
if obj is None: if obj is None:
continue continue
print(obj) text = th.text_cleanup(obj, file['filename'])
texts.append(json.dumps(obj)) texts.append(text)
return json.dumps(texts) output.append({'filename': file['filename'], 'text': texts})
return json.dumps(output)
if __name__ == '__main__': if __name__ == '__main__':
app.run(host='0.0.0.0') app.run(host='0.0.0.0', debug=True)

View File

@ -1,11 +1,13 @@
import sys
import camelot
import PyPDF4 import PyPDF4
import tabula
def get_pdf(file, password=''): def get_pdf(file, password=''):
reader = PyPDF4.PdfFileReader(file) reader = PyPDF4.PdfFileReader(file)
if reader.getIsEncrypted() and password != '': if reader.getIsEncrypted() and password != '':
status = reader.decrypt(password=password) status = reader.decrypt(password=str(password))
if status == 0: if status == 0:
return None return None
return reader return reader
@ -16,16 +18,16 @@ def get_text(filename, password=''):
reader = get_pdf(f, password) reader = get_pdf(f, password)
if reader is None: if reader is None:
return None return None
print(reader.getPage(0).extractText())
texts = [] texts = []
for p in range(0, reader.getNumPages()): for p in range(0, reader.getNumPages()):
print(p)
texts.append(reader.getPage(p).extractText()) texts.append(reader.getPage(p).extractText())
return "\n".join(texts) return texts
def get_data(filename, password=''): def get_data(filename, password=''):
if password == '': with open(filename, 'rb') as file:
return tabula.read_pdf(filename, pages='all', output_format='json') reader = PyPDF4.PdfFileReader(file)
else: print(reader, file=sys.stderr)
return tabula.read_pdf(filename, password=password, pages='all', output_format='json') if password != '' and reader.getIsEncrypted():
return camelot.read_pdf(filename, password=str(password), pages='all', output_format='json')
return camelot.read_pdf(filename, pages='all', output_format='json')

View File

@ -1,3 +1,90 @@
def text_cleanup(text, filename: str = None):
    """Clean up extracted PDF text with the handler selected by *filename*.

    Args:
        text: A single text string, or a (possibly nested) list of them;
            lists are processed element by element and returned as lists.
        filename: Name of the source file. The handler is picked by a
            case-insensitive substring match ('bice', 'scotiabank').

    Returns:
        The handler's result, or *text* unchanged when *filename* is None
        or matches no known handler.
    """
    if isinstance(text, list):
        return [text_cleanup(page, filename=filename) for page in text]
    if filename is None:
        return text
    # Lower-case once; every handler check below is case-insensitive.
    name = filename.lower()
    if 'bice' in name:
        return bice(text)
    if 'scotiabank' in name:
        return scotiabank(text)
    return text
def bice(text):
    """Print the triple-newline-separated chunks of *text*; return *text* as is."""
    chunks = text.split("\n\n\n")
    print(chunks)
    return text
def scotiabank(text):
    """Split *text* into rows section by section and print them.

    The text is split on newlines; the first word is kept as-is and each
    following section (delimited by the literal marker strings below) is
    grouped into rows of a fixed width. The rows are only printed: the
    function returns *text* unchanged.
    """
    words = text.split("\n")
    # NOTE: the first entry is a bare string, the remaining ones are lists.
    output = [words[0]]
    output.extend(extract_from_to(words, 'No. CTA.', end='VENCIMIENTO LINEA DE CREDITO', line_length=3))
    output.extend(extract_from_to(words, 'VENCIMIENTO LINEA DE CREDITO',
                                  end='NOMBRE EJECUTIVO: LILIAN AVILA MANRIQUEZ', line_length=2))
    output.extend(extract_from_to(words, 'NOMBRE EJECUTIVO: LILIAN AVILA MANRIQUEZ', end='SALDO ANTERIOR',
                                  line_length=1))
    output.extend(extract_from_to(words, 'SALDO ANTERIOR', end='FECHA', line_length=4))
    output.extend(extract_data(words, 'FECHA', end='ACTUALICE SIEMPRE ANTECEDENTES LEGALES, ', line_length=6,
                               merge_list=[['DOCTO', 'No.'], ['SALDO', 'DIARIO']]))
    # Plain loop: printing is a side effect, not a list to build.
    for li in output:
        print(li)
    return text
def extract_from_to(word_list, start, line_length, end: str = None, merge_list=None):
    """Group the words from *start* up to (not including) *end* into rows.

    Args:
        word_list: List of words to search.
        start: Word marking the beginning of the section (included).
        line_length: Number of words per row.
        end: Word marking the end of the section (excluded); when None the
            section runs to the end of *word_list*.
        merge_list: Optional word groups passed on to ``extract_by_line``.

    Returns:
        Whatever ``extract_by_line`` returns for the selected section.

    Raises:
        ValueError: If *start* is missing, or *end* does not occur at or
            after *start*.
    """
    first = word_list.index(start)
    if end is None:
        section = word_list[first:]
    else:
        # Look for the end marker from the start marker onwards, so an
        # earlier occurrence of the same word cannot produce an empty slice.
        section = word_list[first:word_list.index(end, first)]
    return extract_by_line(section, line_length, merge_list)
def extract_by_line(word_list, line_length, merge_list=None):
    """Split *word_list* into consecutive rows of *line_length* words.

    Args:
        word_list: Words to group.
        line_length: Number of words per row; must be at least 1.
        merge_list: Optional word groups to merge (via ``merge_words``)
            before the words are grouped.

    Returns:
        A list of rows (lists). The last row may be shorter than
        *line_length*; an empty input yields an empty list.

    Raises:
        ValueError: If *line_length* is smaller than 1.
    """
    if line_length < 1:
        raise ValueError("line_length must be a positive integer")
    if merge_list is not None:
        word_list = merge_words(word_list, merge_list)
    words = list(word_list)
    return [words[i:i + line_length] for i in range(0, len(words), line_length)]
def merge_words(word_list, merge_list):
    """Join each group of consecutive words in *merge_list* into one entry.

    For every group, the first place where all of its words appear
    consecutively in the list is replaced by a single entry made of the
    group's words joined with a newline. Groups are applied in order, each
    one on the result of the previous merge. The input list is not modified.

    Args:
        word_list: List of words.
        merge_list: Iterable of word groups (sequences of strings).

    Returns:
        A new list with the groups merged.

    Raises:
        ValueError: If a non-empty group does not occur consecutively.
    """
    merged = list(word_list)
    for group in merge_list:
        group = list(group)
        if not group:
            continue
        size = len(group)
        # Match the whole group, not only its first word, so a lone earlier
        # occurrence of that word cannot swallow unrelated neighbours.
        for i in range(len(merged) - size + 1):
            if merged[i:i + size] == group:
                merged[i:i + size] = ["\n".join(group)]
                break
        else:
            raise ValueError("merge group {!r} not found in word list".format(group))
    return merged
def extract_data(word_list, start, line_length, end=None, merge_list=None, date_sep='/'):
    """Group the words between *start* and *end* into fixed-width rows.

    A new row begins when the current one holds *line_length* words, or
    earlier when a word containing *date_sep* arrives while a row is in
    progress. A row cut short that way is padded with empty strings, so
    every returned row has exactly *line_length* cells.

    Args:
        word_list: List of words to search.
        start: Word marking the beginning of the section (included).
        line_length: Number of cells per row; must be at least 1.
        end: Word marking the end of the section (excluded), searched for
            after *start*; when None the section runs to the end.
        merge_list: Optional word groups merged (via ``merge_words``)
            before the rows are built.
        date_sep: Substring that identifies the word opening a new row.

    Returns:
        A list of rows, each a list of *line_length* strings.

    Raises:
        ValueError: If *line_length* is smaller than 1, or *start* / *end*
            is not found.
    """
    if line_length < 1:
        raise ValueError("line_length must be a positive integer")
    word_list = word_list[word_list.index(start):]
    if end is not None:
        word_list = word_list[:word_list.index(end)]
    if merge_list is not None:
        word_list = merge_words(word_list, merge_list)

    output = []
    line = []
    for w in word_list:
        if len(line) == line_length:
            output.append(line)
            line = []
        if line and date_sep in w:
            # A date-like word in the middle of a row starts the next
            # record: pad the unfinished row and open a new one with it.
            line.extend([''] * (line_length - len(line)))
            output.append(line)
            line = [w]
            continue
        line.append(w)
    if line:
        # Flush the last row too; it used to be silently dropped.
        line.extend([''] * (line_length - len(line)))
        output.append(line)
    return output

View File

@ -2,12 +2,16 @@ import argparse
import os import os
import contabilidad.pdf as pdf import contabilidad.pdf as pdf
import contabilidad.text_handler as th
def main(args): def main(args):
filename = os.path.realpath(os.path.join(os.path.dirname(__file__), '..', 'data', args.filename)) filename = os.path.realpath(os.path.join(os.path.dirname(__file__), '..', 'data', args.filename))
obj = pdf.get_text(filename, args.password) obj = pdf.get_data(filename, args.password)
print(obj) print(obj)
obj = pdf.get_text(filename, args.password)
text = th.text_cleanup(obj, filename=str(args.filename))
print(text)
if __name__ == '__main__': if __name__ == '__main__':