This commit is contained in:
2021-12-06 22:13:57 -03:00
parent 34b429530f
commit 25f873c453
6 changed files with 275 additions and 66 deletions

View File

@ -2,7 +2,7 @@ FROM python
RUN apt-get update -y && apt-get install -y ghostscript python3-tk libgl-dev
RUN pip install flask pyyaml pypdf4 gunicorn camelot-py[cv] pikepdf
RUN pip install flask pyyaml pypdf4 gunicorn camelot-py[cv] pikepdf httpx
WORKDIR /app
@ -12,4 +12,5 @@ EXPOSE 5000
WORKDIR /app/src
CMD ["gunicorn", "-b 0.0.0.0:5000", "app:app"]
CMD ["python", "app.py"]
#CMD ["gunicorn", "-b 0.0.0.0:5000", "app:app"]

View File

@ -1,3 +1,4 @@
passwords:
- 0839
- 159608395
- 15960839

View File

@ -1,22 +1,40 @@
import io
import json
import os
import sys
from flask import Flask, request
import httpx
from flask import Flask, request, jsonify
import contabilidad.pdf as pdf
import contabilidad.passwords as passwords
import contabilidad.log as log
import contabilidad.text_handler as th
from contabilidad.log import Log
# Flask application object; the route decorators below register handlers on it.
app = Flask(__name__)
# NOTE(review): this writes the 'filename' entry through the `log` module
# imported above, and the very next statement rebinds `log` to a Log instance.
# It looks like a leftover from the module-level logging API -- confirm
# before relying on it.
log.logging['filename'] = '/var/log/python/contabilidad.log'
log = Log('/var/log/python/contabilidad.log')
# Shared secret read from the PYTHON_KEY environment variable (None when unset).
api_key = os.environ.get('PYTHON_KEY')
def validate_key(request_obj):
    """Return True when the request carries the configured API key.

    The key is looked up, in order, in the ``Authorization`` header (either
    the bare key or ``Bearer <key>``) and in the ``API_KEY`` / ``api_key``
    request values.  The first source that is present decides the outcome.

    :param request_obj: object exposing mapping-like ``headers`` and ``values``
    :return: True if the supplied key equals the module-level ``api_key``
    """
    import hmac  # local import keeps the block self-contained

    def matches(candidate):
        # An unset or empty configured key must never authenticate anybody.
        if not api_key or not isinstance(candidate, str):
            return False
        # compare_digest avoids leaking the key through comparison timing.
        return hmac.compare_digest(candidate.encode('utf-8'), api_key.encode('utf-8'))

    if 'Authorization' in request_obj.headers:
        auth = request_obj.headers.get('Authorization')
        if isinstance(auth, list):
            auth = auth[0] if auth else ''
        auth = (auth or '').strip()
        # partition never raises, unlike split(' ')[1] on a bare "Bearer".
        scheme, _, token = auth.partition(' ')
        if scheme == 'Bearer':
            auth = token.strip()
        return matches(auth)
    for name in ('API_KEY', 'api_key'):
        if name in request_obj.values:
            return matches(request_obj.values.get(name))
    return False
@app.route('/pdf/parse', methods=['POST'])
def pdf_parse():
if not validate_key(request):
return jsonify({'message': 'Not Authorized'})
data = request.get_json()
if not isinstance(data['files'], list):
data['files'] = [data['files']]
@ -32,6 +50,11 @@ def pdf_parse():
continue
pdf.remove_encryption(filename, p, temp)
obj = pdf.get_data(temp)
try:
text = th.text_cleanup(pdf.get_text(temp))
except IndexError as ie:
print(ie, file=sys.stderr)
continue
outputs = []
for o in obj:
out = json.loads(o.df.to_json(orient='records'))
@ -48,8 +71,35 @@ def pdf_parse():
out[i] = line
outputs.append(out)
os.remove(temp)
output.append({'filename': file['filename'], 'text': outputs})
return json.dumps(output)
output.append({'bank': text['bank'], 'filename': file['filename'], 'tables': outputs, 'text': text['text']})
return jsonify(output)
@app.route('/cambio/get', methods=['POST'])
def cambios():
    """Proxy an economic-indicator lookup to mindicador.cl.

    Expects a JSON object with ``desde`` (one of the codes in ``valid``) and
    ``fecha`` (a ``-``-separated date whose parts are reversed before being
    sent upstream).  Responds with the upstream JSON, or with an ``error``
    payload when the lookup cannot be completed.
    """
    if not validate_key(request):
        return jsonify({'message': 'Not Authorized'})
    not_found = {'error': 'Valor no encontrado.'}
    valid = {
        "CLF": "uf",
        "IVP": "ivp",
        "USD": "dolar",
        "USDo": "dolar_intercambio",
        "EUR": "euro",
        "IPC": "ipc",
        "UTM": "utm",
        "IMACEC": "imacec",
        "TPM": "tpm",
        "CUP": "libra_cobre",
        "TZD": "tasa_desempleo",
        "BTC": "bitcoin"
    }
    data = request.get_json()
    # A missing/invalid body or an unknown code used to surface as a
    # KeyError/TypeError (HTTP 500); answer with the regular error payload.
    if not isinstance(data, dict):
        return jsonify(not_found)
    code = data.get('desde')
    fecha = data.get('fecha')
    if not isinstance(code, str) or code not in valid or not isinstance(fecha, str):
        return jsonify(not_found)
    base_url = 'https://mindicador.cl/api/'
    # NOTE(review): `fecha` is interpolated into the upstream path without
    # further validation -- consider restricting it to digits and dashes.
    day = '-'.join(reversed(fecha.split('-')))
    url = f"{base_url}{valid[code]}/{day}"
    try:
        res = httpx.get(url)
        if res.status_code != httpx.codes.OK:
            return jsonify(not_found)
        payload = res.json()
    except (httpx.HTTPError, ValueError):
        # Network failure, timeout or a non-JSON upstream body.
        return jsonify(not_found)
    return jsonify(payload)
if __name__ == '__main__':

View File

@ -1,19 +1,65 @@
import os.path
import time
logging = {
'filename': '/var/log/python/error.log'
}
import traceback
class LOG_LEVEL:
    """Numeric log severities and their display names.

    Each constant used to be assigned twice (first a string, then the
    integer); only the integer assignment was ever effective, so the dead
    string assignments are gone.
    """
    # NOTE(review): DEBUG ranks above WARNING in this numbering -- confirm
    # that ordering is intended before relying on threshold comparisons.
    INFO = 0
    WARNING = 1
    DEBUG = 2
    ERROR = 4

    # Built once at class creation instead of on every desc() call.
    _NAMES = {INFO: 'INFO', WARNING: 'WARNING', DEBUG: 'DEBUG', ERROR: 'ERROR'}

    @staticmethod
    def desc(level):
        """Return the display name for ``level``.

        A value that is not one of the constants is rendered with ``str()``
        so that writing a log record never fails on an unexpected level.
        """
        return LOG_LEVEL._NAMES.get(level, str(level))
def log(message, level=LOG_LEVEL.INFO):
    """Append one timestamped record to the file named by ``logging['filename']``.

    :param message: text to record (converted with ``str()``)
    :param level: a LOG_LEVEL constant, or an already formatted level name
    """
    filename = logging['filename']
    # LOG_LEVEL constants are integers, so concatenating them directly raised
    # TypeError; map them to their names and pass textual levels through.
    label = level if isinstance(level, str) else LOG_LEVEL.desc(level)
    record = time.strftime('[%Y-%m-%d %H:%M:%S] ') + ' - ' + label + ': ' + str(message)
    # Records were written without a terminator and ran together on one line.
    if not record.endswith("\n"):
        record += "\n"
    with open(filename, 'a') as f:
        f.write(record)
class Logger:
    """Fan a message out to several Log files, each with a minimum level."""

    def __init__(self):
        # Entries are dicts: {'log': Log, 'level': minimum level accepted}.
        self._logs = []

    def add_log(self, filename: str, min_level: int = LOG_LEVEL.INFO):
        """Register ``filename`` for messages at ``min_level`` or above.

        Returns ``self`` so registrations can be chained.
        """
        self._logs.append({'log': Log(filename), 'level': min_level})
        self._logs.sort(key=lambda e: e['level'])
        return self

    def log(self, message, level: int = LOG_LEVEL.INFO):
        """Write ``message`` to every registered file whose minimum level it meets."""
        # Log.log promotes exceptions to ERROR; do the same before filtering
        # so an exception logged at the default level still reaches error logs.
        if isinstance(message, Exception) and level < LOG_LEVEL.ERROR:
            level = LOG_LEVEL.ERROR
        for entry in self._logs:
            # The comparison used to be inverted (min_level >= level), which
            # sent everything to the ERROR-only file and kept errors out of
            # the INFO file.
            if level >= entry['level']:
                entry['log'].log(message, level)
class Log:
    """Append timestamped records to a file, switching to a numbered sibling
    (``name.1.ext``, ``name.2.ext``, ...) once the current one exceeds MAX_SIZE.
    """
    MAX_SIZE = 10 * 1024 * 1024  # bytes

    def __init__(self, filename: str = '/var/log/python/error.log'):
        self._filename = filename

    def log(self, message, level: int = LOG_LEVEL.INFO):
        """Write one record; exceptions are rendered with their traceback and
        logged at ERROR or above."""
        if isinstance(message, Exception):
            # Format the exception that was passed in. format_exc() only knows
            # the exception currently being handled, which is None outside an
            # ``except`` block.
            message = ''.join(
                traceback.format_exception(type(message), message, message.__traceback__))
            if level < LOG_LEVEL.ERROR:
                level = LOG_LEVEL.ERROR
        self.rotate_file()
        stamp = time.strftime('[%Y-%m-%d %H:%M:%S] ')
        with open(self._filename, 'a', encoding='utf-8') as f:
            # str() lets callers log non-string objects without a TypeError.
            f.write(stamp + ' - ' + LOG_LEVEL.desc(level=level) + ': ' + str(message) + "\n")

    def rotate_file(self):
        """Advance to the first file in the sequence that is still under MAX_SIZE."""
        # Loop instead of a single step: after a restart the base file and
        # several numbered files may already be full.
        while os.path.isfile(self._filename) and os.path.getsize(self._filename) > self.MAX_SIZE:
            self.next_file()

    def next_file(self):
        """Point the log at the next numbered file name."""
        # splitext only looks at the final path component, so dots in
        # directory names and extension-less files are handled correctly.
        root, ext = os.path.splitext(self._filename)
        stem, counter = os.path.splitext(root)
        if counter[1:].isnumeric():
            n = int(counter[1:]) + 1
        else:
            stem = root
            n = 1
        self._filename = stem + '.' + str(n) + ext

View File

@ -1,48 +1,112 @@
def text_cleanup(text: str):
    """Detect the issuing bank from the statement text and parse it.

    The stale filename-based dispatch that was fused into this function
    (a second ``def`` line plus unreachable branches) has been removed;
    detection relies on the text alone.

    :param text: statement text, or a list of page texts (joined with three
        newlines before inspection)
    :return: dict with ``bank`` (display name, ``'unknown'`` as fallback) and
        ``text`` (the rows produced by the matching parser)
    """
    if isinstance(text, list):
        text = "\n\n\n".join(text)
    lowered = text.lower()  # computed once for both case-insensitive checks
    if 'bice' in lowered:
        return {'bank': 'BICE', 'text': bice(text)}
    if 'scotiabank' in lowered:
        return {'bank': 'Scotiabank', 'text': scotiabank(text)}
    # Case-sensitive on purpose, as before: card statements are recognised by
    # the upper-case word.
    if 'TARJETA' in text:
        return {'bank': 'Scotiabank', 'text': tarjeta(text)}
    return {'bank': 'unknown', 'text': basic(text)}
def bice(text):
    """Lay out a BICE account statement as a list of rows.

    A leftover debug ``print`` and an early ``return text`` used to make the
    parsing below unreachable; both are removed.

    :param text: statement text
    :return: list of rows as produced by ``extract_from_to``
    :raises IndexError: when an expected heading is missing from the text
    """
    # Splitting on "\n\n\n", then "\n\n", then "\n" and dropping blanks yields
    # exactly the non-blank lines, so a single split is enough.
    lines = [piece.strip() for piece in text.split("\n") if piece.strip() != '']
    output = []
    output += extract_from_to(lines, 'NOMBRE DEL CLIENTE', end='LAS CONDES', line_length=3)
    title = [t for t in lines if 'MOVIMIENTOS DE LA CUENTA CORRIENTE' in t][0]
    output += extract_from_to(lines, 'LAS CONDES', end=title, line_length=3)
    output += [title]
    # The slice between the first two 'FECHA' lines is laid out four per row.
    fecha_rows = [i for i, t in enumerate(lines) if 'FECHA' in t]
    output += extract_from_to(lines, fecha_rows[0], end=fecha_rows[1], line_length=4)
    output += extract_from_to(lines, 'RESUMEN DEL PERIODO', end='SALDO INICIAL', line_length=1)
    output += extract_from_to(lines, 'SALDO INICIAL', end='LINEA SOBREGIRO AUTORIZADA', line_length=4)
    output += extract_from_to(lines, 'LINEA SOBREGIRO AUTORIZADA', end='OBSERVACIONES', line_length=3)
    output += extract_from_to(lines, 'OBSERVACIONES', line_length=1)
    return output
def scotiabank(text):
    """Lay out a Scotiabank account statement as a list of rows.

    :param text: statement text (or list of page texts, see ``split_words``)
    :return: list of rows; the first entry is the first word of the text
    :raises IndexError: when the executive line is missing
    :raises ValueError: when another expected heading is missing
    """
    words = split_words(text)
    # split_words strips every word, so the marker must not carry the trailing
    # space it used to have -- with it, .index() could never find a match.
    closing = 'ACTUALICE SIEMPRE ANTECEDENTES LEGALES,'
    # Locate the executive line by its label instead of one hard-coded name,
    # so statements handled by a different executive parse as well.
    executive = [i for i, w in enumerate(words) if w.startswith('NOMBRE EJECUTIVO:')][0]
    output = [words[0]]
    output += extract_from_to(words, 'No. CTA.', end='VENCIMIENTO LINEA DE CREDITO', line_length=3)
    output += extract_from_to(words, 'VENCIMIENTO LINEA DE CREDITO', end=executive, line_length=2)
    output += extract_from_to(words, executive, end='SALDO ANTERIOR', line_length=1)
    output += extract_from_to(words, 'SALDO ANTERIOR', end='FECHA', line_length=4)
    output += extract_data(words, 'FECHA', end=closing, line_length=6,
                           merge_list=[['DOCTO', 'No.'], ['SALDO', 'DIARIO']])
    output += extract_from_to(words, closing, 1)
    return output
def extract_from_to(word_list, start, line_length, end: str = None, merge_list=None):
def tarjeta(text):
    """Lay out a credit-card statement as a list of rows.

    The words of the statement are cut into consecutive sections, each
    delimited by a heading (or a computed word index) and laid out with its
    own number of columns; fixed section titles are inserted in between.

    :param text: statement text (or list of page texts, see ``split_words``)
    :return: list starting with the statement title, followed by the rows
    """
    words = split_words(text)
    output = ['ESTADO DE CUENTA NACIONAL DE TARJETA DE CRÉDITO']

    def take(start, **options):
        # Append the rows of one section to the result.
        output.extend(extract_from_to(words, start, **options))

    def position(fragment, occurrence=0):
        # Index of the n-th word that contains `fragment`.
        return [k for k, w in enumerate(words) if fragment in w][occurrence]

    holder_end = position('FECHA ESTADO DE CUENTA') + 2
    take('NOMBRE DEL TITULAR', end=holder_end, line_length=2)

    output.append('I. INFORMACIóN GENERAL')
    second_cupo = position('CUPO TOTAL', 1)
    take('CUPO TOTAL', end=second_cupo, line_length=3)
    take(second_cupo, end='ROTATIVO', line_length=4)
    take('ROTATIVO', end='TASA INTERÉS VIGENTE', line_length=3)
    take('TASA INTERÉS VIGENTE',
         end='CAE se calcula sobre un supuesto de gasto mensual de UF 20 y pagadero en 12 cuotas.',
         line_length=4)
    take('DESDE', end='PERÍODO FACTURADO', line_length=2)
    take('PERÍODO FACTURADO', end='II.', line_length=3)

    output.append('II. DETALLE')
    take('1. PERÍODO ANTERIOR', end='SALDO ADEUDADO INICIO PERÍODO ANTERIOR', line_length=3)
    current_period = words.index('2. PERÍODO ACTUAL')
    take('SALDO ADEUDADO INICIO PERÍODO ANTERIOR', end=current_period - 1, line_length=2,
         merge_list=[['MONTO FACTURADO A PAGAR (PERÍODO ANTERIOR)', '(A)']], merge_character=" ")

    output.append('2. PERÍODO ACTUAL')
    take('LUGAR DE', end='1.TOTAL OPERACIONES', line_length=7,
         merge_list=[['OPERACIÓN', 'O COBRO'], ['TOTAL A', 'PAGAR'], ['VALOR CUOTA', 'MENSUAL']])
    after_operations = words.index('1.TOTAL OPERACIONES') + 3
    take('1.TOTAL OPERACIONES', end=after_operations, line_length=3)
    take(after_operations, end='TOTAL PAGOS A LA CUENTA', line_length=7)
    after_payments = words.index('TOTAL PAGOS A LA CUENTA') + 2
    take('TOTAL PAGOS A LA CUENTA', end=after_payments, line_length=2)
    take(after_payments, end='TOTAL PAT A LA CUENTA', line_length=8)
    after_pat = words.index('TOTAL PAT A LA CUENTA') + 2
    take('TOTAL PAT A LA CUENTA', end=after_pat, line_length=2)
    take(after_pat, end=after_pat + 3, line_length=2,
         merge_list=[
             ['2.PRODUCTOS O SERVICIOS VOLUNTARIAMENTE CONTRATADOS SIN MOVIMIENTOS', '(C)']],
         merge_character=" ")

    # The charges section is optional.
    if '3.CARGOS, COMISIONES, IMPUESTOS Y ABONOS' in words:
        after_charges = words.index('3.CARGOS, COMISIONES, IMPUESTOS Y ABONOS') + 3
        take('3.CARGOS, COMISIONES, IMPUESTOS Y ABONOS', end=after_charges, line_length=3)
    return output
def basic(text):
    """Fallback clean-up for unrecognised statements: return the words that
    ``split_words`` extracts from ``text``."""
    words = split_words(text)
    return words
def split_words(text):
    """Return the non-blank lines of ``text``, each stripped of surrounding
    whitespace.

    A list argument is first joined with three newlines between its items.
    """
    source = "\n\n\n".join(text) if isinstance(text, list) else text
    stripped = (line.strip() for line in source.split("\n"))
    return [line for line in stripped if line != '']
def extract_from_to(word_list, start, line_length, end=None, merge_list=None, merge_character="\n"):
    """Lay out the words between ``start`` and ``end`` in rows.

    :param word_list: list of words to cut from
    :param start: first word of the section, or its integer index
    :param line_length: row width handed to ``extract_by_line``
    :param end: word (or integer index) that ends the section, exclusive;
        ``None`` means "to the end of the list"
    :param merge_list: word sequences handed to ``extract_by_line`` for merging
    :param merge_character: separator used when merging
    :return: whatever ``extract_by_line`` returns for the selected slice
    :raises ValueError: when a textual ``start`` or ``end`` is not in the list
    """
    # Stale pre-refactor returns used to call word_list.index() on the already
    # resolved integer; markers are now resolved exactly once.
    if not isinstance(start, int):
        start = word_list.index(start)
    if end is None:
        section = word_list[start:]
    else:
        if not isinstance(end, int):
            try:
                # Prefer the first occurrence at or after the section start so
                # an earlier duplicate of the end marker does not yield an
                # empty slice.
                end = word_list.index(end, start)
            except ValueError:
                end = word_list.index(end)
        section = word_list[start:end]
    return extract_by_line(section, line_length, merge_list, merge_character)
def extract_by_line(word_list, line_length, merge_list=None):
def extract_by_line(word_list, line_length, merge_list=None, merge_character="\n"):
if merge_list is not None:
word_list = merge_words(word_list, merge_list)
word_list = merge_words(word_list, merge_list, merge_character)
output = []
line = []
for k, w in enumerate(word_list):
@ -54,22 +118,39 @@ def extract_by_line(word_list, line_length, merge_list=None):
return output
def merge_words(word_list, merge_list, merge_character="\n"):
    """Collapse every occurrence of each word sequence into a single word.

    :param word_list: list of words
    :param merge_list: iterable of word sequences; each run of consecutive
        words equal to a sequence is replaced by the sequence joined with
        ``merge_character``
    :param merge_character: separator for the merged word (defaults to a
        newline, so two-argument callers keep working)
    :return: a new list; ``word_list`` itself is not modified
    """
    for m in merge_list:
        phrase = list(m)
        size = len(phrase)
        if size == 0:
            continue
        merged = []
        i = 0
        # Single forward scan. The previous version collected all match
        # positions first and then merged, so every position after the first
        # merge pointed at the wrong words once the list had shrunk.
        while i < len(word_list):
            if word_list[i:i + size] == phrase:
                merged.append(merge_character.join(phrase))
                i += size  # also skips overlapping matches
            else:
                merged.append(word_list[i])
                i += 1
        word_list = merged
    return word_list
def extract_data(word_list, start, line_length, end=None, merge_list=None, date_sep='/'):
def find_words(word_list, find_list):
    """Return every index at which ``find_list`` occurs as a run of
    consecutive words in ``word_list``.

    :param word_list: list of words to search
    :param find_list: sequence of words to look for
    :return: list of start indices, in ascending order (empty when
        ``find_list`` is empty or longer than ``word_list``)
    """
    target = list(find_list)
    size = len(target)
    if size == 0:
        return []
    # Slice comparison cannot run past the end of the list, unlike the former
    # word_list[i + k] lookup, which raised IndexError when only the first
    # words of the sequence appeared at the very end.
    return [i for i in range(len(word_list) - size + 1) if word_list[i:i + size] == target]
def extract_data(word_list, start, line_length, end=None, merge_list=None, merge_character="\n", date_sep='/'):
word_list = word_list[word_list.index(start):]
if end is not None:
word_list = word_list[:word_list.index(end)]
if merge_list is not None:
word_list = merge_words(word_list, merge_list)
word_list = merge_words(word_list, merge_list, merge_character)
output = []
line = []
line_num = 0
col = 0
for k, w in enumerate(word_list):
if col > 0 and col % line_length == 0:
@ -87,4 +168,5 @@ def extract_data(word_list, start, line_length, end=None, merge_list=None, date_
continue
line.append(w)
col += 1
output.append(line)
return output

View File

@ -3,22 +3,51 @@ import os
import contabilidad.pdf as pdf
import contabilidad.text_handler as th
from contabilidad.log import Logger, LOG_LEVEL
import ai.dictionary as dictionary
from ai.network import AI
def parse_settings(args):
    """Resolve the file locations used by a run and build its logger.

    :param args: parsed command line; ``filename`` and ``log_file`` are read
    :return: dict with ``filename``, ``temp``, ``dictionary``, ``network``,
        ``log_file``, ``error_log_file`` and ``logger``
    """
    output = {'filename': args.filename}
    if not os.path.isfile(output['filename']):
        # Not an existing path: look for it in the sibling ``data`` directory.
        output['filename'] = os.path.realpath(
            os.path.join(os.path.dirname(__file__), '..', 'data', args.filename))
    data_dir = os.path.dirname(output['filename'])
    # Derive the temp name from the base name only; split('.')[0] broke on
    # names such as './x.pdf', 'dir/x.pdf' or 'a.b.pdf'.
    stem = os.path.splitext(os.path.basename(args.filename))[0]
    output['temp'] = os.path.realpath(os.path.join(data_dir, stem + '-temp.pdf'))
    output['dictionary'] = os.path.join(data_dir, 'dictionary.json')
    output['network'] = os.path.join(data_dir, 'network.json')
    # --log_file defaults to None, which made os.path.isfile() raise
    # TypeError; fall back to a file name resolved like any other relative one.
    output['log_file'] = args.log_file or 'contabilidad.log'
    if not os.path.isfile(output['log_file']):
        output['log_file'] = os.path.join(os.path.dirname(data_dir), output['log_file'])
    output['error_log_file'] = os.path.join(os.path.dirname(output['log_file']), 'error.log')
    output['logger'] = Logger()
    output['logger'].add_log(output['log_file']).add_log(output['error_log_file'], LOG_LEVEL.ERROR)
    return output
def _build_dictionary(settings, password):
    """Dictionary step that used to sit, unreachable, after ``exit()`` in main.

    Kept as a helper so it can be re-enabled with a single call.
    """
    print('Loading dictionary.')
    dictio = dictionary.Dictionary(settings['dictionary'], settings['logger'])
    print('Getting possible phrases.')
    dictio.process(settings['filename'], password)
    dictio.to_data()
    # TODO: saving was disabled in the original (dictio.save()).


def main(args):
    """Feed the file named on the command line to the AI network.

    The stale decrypt/parse pipeline that was fused into this function is
    removed: its result was unused and it called ``th.text_cleanup`` with a
    ``filename`` keyword that the function no longer accepts.

    :param args: parsed command line (see ``parse_settings``)
    """
    settings = parse_settings(args)
    print('Loading AI')
    network = AI(settings['dictionary'], settings['logger'])
    network.set_filename(settings['network'])
    network.add_source({'filename': settings['filename'], 'password': args.password})
    network.process_sources()
    # The run still stops here, as it did with the former exit() call, but by
    # returning, so the interpreter is not terminated when main() is called
    # from other code. Call _build_dictionary(settings, args.password) to
    # re-enable the dictionary step.
if __name__ == '__main__':
    # Command-line entry point: parse the options and hand them to main().
    parser = argparse.ArgumentParser()
    # parse_settings() cannot work without a file name, so let argparse report
    # a missing -f instead of failing later with a TypeError.
    parser.add_argument('-f', '--filename', type=str, required=True)
    parser.add_argument('-p', '--password', type=str, default='')
    parser.add_argument('-t', '--temp_filename', type=str)
    parser.add_argument('-l', '--log_file', type=str, default=None)
    _args = parser.parse_args()
    main(_args)