Camelot reading
This commit is contained in:
@ -1,8 +1,8 @@
|
|||||||
FROM php:8-fpm
|
FROM php:8-fpm
|
||||||
|
|
||||||
RUN apt-get update -y && apt-get install -y git
|
RUN apt-get update -y && apt-get install -y git libzip-dev zip
|
||||||
|
|
||||||
RUN docker-php-ext-install pdo pdo_mysql
|
RUN docker-php-ext-install pdo pdo_mysql zip
|
||||||
|
|
||||||
COPY --from=composer /usr/bin/composer /usr/bin/composer
|
COPY --from=composer /usr/bin/composer /usr/bin/composer
|
||||||
|
|
||||||
|
@ -24,7 +24,7 @@ class PdfHandler {
|
|||||||
$output []= ['filename' => $file->getBasename()];
|
$output []= ['filename' => $file->getBasename()];
|
||||||
}
|
}
|
||||||
$response = $this->client->post($this->url, ['json' => ['files' => $output]]);
|
$response = $this->client->post($this->url, ['json' => ['files' => $output]]);
|
||||||
!d(json_decode($response->getBody()));
|
$output = json_decode($response->getBody());
|
||||||
return $output;
|
return $output;
|
||||||
}
|
}
|
||||||
}
|
}
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
1
python/.gitignore
vendored
Normal file
1
python/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
**/__pycache__/
|
@ -1,8 +1,8 @@
|
|||||||
FROM python
|
FROM python
|
||||||
|
|
||||||
RUN apt-get update -y && apt-get install -y default-jre
|
RUN apt-get update -y && apt-get install -y ghostscript python3-tk libgl-dev
|
||||||
|
|
||||||
RUN pip install flask tabula-py pyyaml pypdf4 gunicorn
|
RUN pip install flask pyyaml pypdf4 gunicorn camelot-py[cv]
|
||||||
|
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
|
BIN
python/data/Scotiabank-CC-2021-10.pdf
Normal file
BIN
python/data/Scotiabank-CC-2021-10.pdf
Normal file
Binary file not shown.
@ -5,6 +5,7 @@ from flask import Flask, request
|
|||||||
import contabilidad.pdf as pdf
|
import contabilidad.pdf as pdf
|
||||||
import contabilidad.passwords as passwords
|
import contabilidad.passwords as passwords
|
||||||
import contabilidad.log as log
|
import contabilidad.log as log
|
||||||
|
import contabilidad.text_handler as th
|
||||||
|
|
||||||
|
|
||||||
app = Flask(__name__)
|
app = Flask(__name__)
|
||||||
@ -18,17 +19,21 @@ def pdf_parse():
|
|||||||
data['files'] = [data['files']]
|
data['files'] = [data['files']]
|
||||||
password_file = '/app/config/.passwords.yml'
|
password_file = '/app/config/.passwords.yml'
|
||||||
pwds = passwords.get_passwords(password_file)
|
pwds = passwords.get_passwords(password_file)
|
||||||
texts = []
|
output = []
|
||||||
for file in data['files']:
|
for file in data['files']:
|
||||||
filename = os.path.realpath(os.path.join('/app/data', file['filename']))
|
filename = os.path.realpath(os.path.join('/app/data', file['filename']))
|
||||||
|
texts = []
|
||||||
for p in pwds:
|
for p in pwds:
|
||||||
|
obj = pdf.get_data(filename, p)
|
||||||
|
print(obj)
|
||||||
obj = pdf.get_text(filename, p)
|
obj = pdf.get_text(filename, p)
|
||||||
if obj is None:
|
if obj is None:
|
||||||
continue
|
continue
|
||||||
print(obj)
|
text = th.text_cleanup(obj, file['filename'])
|
||||||
texts.append(json.dumps(obj))
|
texts.append(text)
|
||||||
return json.dumps(texts)
|
output.append({'filename': file['filename'], 'text': texts})
|
||||||
|
return json.dumps(output)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    import os

    # The development server listens on every interface (0.0.0.0), so the
    # interactive debugger must not be switched on unconditionally: anyone
    # who can reach the port could use it. Opt in through the environment.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true')
    app.run(host='0.0.0.0', debug=debug_enabled)
|
||||||
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -1,11 +1,13 @@
|
|||||||
|
import sys
|
||||||
|
|
||||||
|
import camelot
|
||||||
import PyPDF4
|
import PyPDF4
|
||||||
import tabula
|
|
||||||
|
|
||||||
|
|
||||||
def get_pdf(file, password=''):
    """Open *file* with PyPDF4 and decrypt it when it is encrypted.

    :param file: value handed straight to ``PyPDF4.PdfFileReader``.
    :param password: password to try; it is converted with ``str`` because
        callers may pass non-string values. ``None`` is treated as an
        empty password.
    :return: the reader, or ``None`` when the file is encrypted and
        ``decrypt`` reports failure (status ``0``).
    """
    reader = PyPDF4.PdfFileReader(file)
    if not reader.getIsEncrypted():
        return reader

    # Always attempt decryption on an encrypted file, even with an empty
    # password, so callers never receive a reader that is still locked.
    secret = '' if password is None else str(password)
    status = reader.decrypt(password=secret)
    if status == 0:
        return None
    return reader
|
||||||
@ -16,16 +18,16 @@ def get_text(filename, password=''):
|
|||||||
reader = get_pdf(f, password)
|
reader = get_pdf(f, password)
|
||||||
if reader is None:
|
if reader is None:
|
||||||
return None
|
return None
|
||||||
print(reader.getPage(0).extractText())
|
|
||||||
texts = []
|
texts = []
|
||||||
for p in range(0, reader.getNumPages()):
|
for p in range(0, reader.getNumPages()):
|
||||||
print(p)
|
|
||||||
texts.append(reader.getPage(p).extractText())
|
texts.append(reader.getPage(p).extractText())
|
||||||
return "\n".join(texts)
|
return texts
|
||||||
|
|
||||||
|
|
||||||
def get_data(filename, password=''):
    """Read the tables of a PDF with camelot.

    The file is first opened with PyPDF4 only to find out whether it is
    encrypted; the password is forwarded to camelot just in that case.

    :param filename: path of the PDF file.
    :param password: password to use for encrypted files; converted with
        ``str``. ``None`` and ``''`` both mean "no password".
    :return: whatever ``camelot.read_pdf`` returns.
    """
    secret = '' if password is None else str(password)

    with open(filename, 'rb') as file:
        reader = PyPDF4.PdfFileReader(file)
        # Debug trace kept from the original implementation.
        print(reader, file=sys.stderr)
        encrypted = reader.getIsEncrypted()

    # NOTE(review): `output_format` looks carried over from the previous
    # tabula call - confirm that camelot.read_pdf actually uses it.
    options = {'pages': 'all', 'output_format': 'json'}
    if secret != '' and encrypted:
        options['password'] = secret
    return camelot.read_pdf(filename, **options)
|
||||||
|
@ -1,3 +1,90 @@
|
|||||||
def text_cleanup(text, filename: str = None):
    """Dispatch extracted PDF text to a bank-specific clean-up routine.

    :param text: a string, or a (possibly nested) list of strings; lists
        are processed element by element and returned as a new list.
    :param filename: name of the source file. The handler is chosen by a
        case-insensitive substring match on it ('bice', 'scotiabank').
    :return: the handler's result, or ``text`` unchanged when no filename
        is given or no handler matches.
    """
    if isinstance(text, list):
        return [text_cleanup(item, filename=filename) for item in text]
    if filename is None:
        return text

    lowered = filename.lower()
    if 'bice' in lowered:
        return bice(text)
    if 'scotiabank' in lowered:
        return scotiabank(text)
    return text
|
||||||
|
|
||||||
|
|
||||||
|
def bice(text):
    """Print the chunks of *text* separated by three newlines.

    The text itself is returned unchanged; the split is only printed.
    """
    print(text.split("\n\n\n"))
    return text
|
||||||
|
|
||||||
|
|
||||||
|
def scotiabank(text):
    """Cut a Scotiabank statement into rows and print them.

    The text is split on newlines and divided into sections delimited by
    literal marker strings; every section is chunked into rows of a fixed
    width. The rows are only printed: ``text`` is returned unchanged.

    A missing marker makes the helpers raise ``ValueError``; since the rows
    are not part of the return value, that is reported with a printed note
    instead of aborting the caller.

    :param text: text of one statement page.
    :return: ``text``, unmodified.
    """
    words = text.split("\n")
    try:
        output = [words[0]]
        output += extract_from_to(words, 'No. CTA.', end='VENCIMIENTO LINEA DE CREDITO', line_length=3)
        # NOTE(review): the next two sections use a specific executive's
        # name as a marker - statements naming someone else will not match.
        output += extract_from_to(words, 'VENCIMIENTO LINEA DE CREDITO',
                                  end='NOMBRE EJECUTIVO: LILIAN AVILA MANRIQUEZ', line_length=2)
        output += extract_from_to(words, 'NOMBRE EJECUTIVO: LILIAN AVILA MANRIQUEZ', end='SALDO ANTERIOR',
                                  line_length=1)
        output += extract_from_to(words, 'SALDO ANTERIOR', end='FECHA', line_length=4)
        output += extract_data(words, 'FECHA', end='ACTUALICE SIEMPRE ANTECEDENTES LEGALES, ', line_length=6,
                               merge_list=[['DOCTO', 'No.'], ['SALDO', 'DIARIO']])
    except ValueError as err:
        print('scotiabank: layout marker not found: {}'.format(err))
        return text

    for row in output:
        print(row)
    return text
|
||||||
|
|
||||||
|
|
||||||
|
def extract_from_to(word_list, start, line_length, end: str = None, merge_list=None):
    """Chunk the words between two markers into rows.

    :param word_list: list of words to search.
    :param start: marker word; the section begins at its first occurrence
        (the marker itself is included).
    :param line_length: number of words per row.
    :param end: optional marker word; the section stops right before its
        first occurrence at or after ``start``. Without it the section
        runs to the end of the list.
    :param merge_list: optional word groups passed on to
        ``extract_by_line``.
    :return: the result of ``extract_by_line`` for the section.
    :raises ValueError: if ``start`` is missing, or ``end`` does not occur
        at or after ``start``.
    """
    begin = word_list.index(start)
    if end is None:
        section = word_list[begin:]
    else:
        # Search from `begin` so an earlier occurrence of `end` cannot
        # produce an empty (reversed) slice.
        section = word_list[begin:word_list.index(end, begin)]
    return extract_by_line(section, line_length, merge_list)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_by_line(word_list, line_length, merge_list=None):
    """Split a list of words into consecutive rows of ``line_length`` words.

    :param word_list: list of words.
    :param line_length: words per row; must be at least 1.
    :param merge_list: optional word groups that are first collapsed with
        ``merge_words``.
    :return: list of rows; the last row may be shorter. An empty input
        gives an empty list.
    :raises ValueError: if ``line_length`` is smaller than 1.
    """
    if line_length < 1:
        raise ValueError('line_length must be a positive integer')
    if merge_list is not None:
        word_list = merge_words(word_list, merge_list)
    return [
        list(word_list[pos:pos + line_length])
        for pos in range(0, len(word_list), line_length)
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def merge_words(word_list, merge_list):
    """Collapse multi-word groups into single newline-joined entries.

    For every group in ``merge_list`` the first place where the whole
    group appears as consecutive words is replaced by one entry made of
    the group's words joined with a newline. Groups that do not occur
    (or are empty) are skipped.

    :param word_list: list of words; it is not modified.
    :param merge_list: iterable of word sequences to merge.
    :return: a new list with the merges applied.
    """
    merged = list(word_list)
    for group in merge_list:
        group = list(group)
        size = len(group)
        if size == 0:
            continue
        for pos in range(len(merged) - size + 1):
            # Require the full sequence to match, not just its first word.
            if merged[pos:pos + size] == group:
                merged[pos:pos + size] = ["\n".join(group)]
                break
    return merged
|
||||||
|
|
||||||
|
|
||||||
|
def extract_data(word_list, start, line_length, end=None, merge_list=None, date_sep='/'):
    """Rebuild table rows from a flat word list, using dates as row starts.

    The section beginning at ``start`` (and stopping before ``end`` when
    given) is read word by word into rows of ``line_length`` cells. When a
    word containing ``date_sep`` shows up while a row is only partly
    filled, the row is padded with empty strings and a new row is started
    with that word.

    :param word_list: list of words.
    :param start: marker word where the section begins (included).
    :param line_length: number of cells per row; must be at least 1.
    :param end: optional marker word, searched after ``start``, where the
        section stops (excluded).
    :param merge_list: optional word groups collapsed with ``merge_words``
        before the rows are built.
    :param date_sep: substring that identifies a word as a row start.
    :return: list of rows, each ``line_length`` cells long; the final row
        is included and padded like any other short row.
    :raises ValueError: if a marker is missing or ``line_length`` is
        smaller than 1.
    """
    if line_length < 1:
        raise ValueError('line_length must be a positive integer')

    words = word_list[word_list.index(start):]
    if end is not None:
        words = words[:words.index(end)]
    if merge_list is not None:
        words = merge_words(words, merge_list)

    rows = []
    row = []
    for word in words:
        if len(row) == line_length:
            rows.append(row)
            row = []
        if row and date_sep in word:
            # A new dated entry begins before the current row is complete:
            # fill the missing cells and start over with this word.
            row.extend([''] * (line_length - len(row)))
            rows.append(row)
            row = [word]
            continue
        row.append(word)

    if row:
        # Flush the row still being built when the input runs out.
        row.extend([''] * (line_length - len(row)))
        rows.append(row)
    return rows
|
||||||
|
@ -2,12 +2,16 @@ import argparse
|
|||||||
import os
|
import os
|
||||||
|
|
||||||
import contabilidad.pdf as pdf
|
import contabilidad.pdf as pdf
|
||||||
|
import contabilidad.text_handler as th
|
||||||
|
|
||||||
|
|
||||||
def main(args):
    """Print the table data and the cleaned-up text of one PDF.

    ``args.filename`` is resolved relative to the ``data`` directory next
    to this script's parent; ``args.password`` is passed to both readers.
    """
    data_dir = os.path.join(os.path.dirname(__file__), '..', 'data')
    filename = os.path.realpath(os.path.join(data_dir, args.filename))

    print(pdf.get_data(filename, args.password))

    pages = pdf.get_text(filename, args.password)
    print(th.text_cleanup(pages, filename=str(args.filename)))
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
Reference in New Issue
Block a user