def validate_key(request_obj, expected_key=None):
    """Return True when the request carries the configured API key.

    The key is looked up, in order, in the ``Authorization`` header (either
    the bare key or ``Bearer <key>``), the ``API_KEY`` request value and the
    ``api_key`` request value.  The first location that is present decides
    the result; later locations are not consulted.

    :param request_obj: object exposing ``headers`` and ``values`` mappings
        (the Flask ``request`` in this application).
    :param expected_key: key to compare against; defaults to the module-level
        ``api_key`` read from the ``PYTHON_KEY`` environment variable.
    :return: ``True`` if the supplied credential matches, ``False`` otherwise.
    """
    import hmac  # local import keeps the block self-contained

    if expected_key is None:
        expected_key = api_key
    if not expected_key:
        # No key configured: never let an empty credential authenticate.
        return False

    def _matches(candidate):
        if not isinstance(candidate, str):
            return False
        # Constant-time comparison so response timing does not leak the key.
        return hmac.compare_digest(candidate.encode('utf-8'), expected_key.encode('utf-8'))

    if 'Authorization' in request_obj.headers:
        auth = request_obj.headers.get('Authorization')
        if isinstance(auth, list):
            auth = auth[0] if auth else ''
        if isinstance(auth, str) and 'Bearer' in auth:
            parts = auth.split(' ')
            if len(parts) < 2:
                # "Bearer" with no token used to raise IndexError (HTTP 500).
                return False
            auth = parts[1]
        return _matches(auth)
    for field in ('API_KEY', 'api_key'):
        if field in request_obj.values:
            return _matches(request_obj.values.get(field))
    return False
@app.route('/cambio/get', methods=['POST'])
def cambios():
    """Look up an economic indicator on mindicador.cl and relay the answer.

    The request body must be a JSON object with:

    * ``desde`` - one of the indicator codes listed in ``valid`` below;
    * ``fecha`` - dash-separated numeric date, e.g. ``YYYY-MM-DD``; the parts
      are reversed before being sent upstream.

    Returns the upstream JSON on success.  Malformed input yields a JSON
    ``error`` with status 400 instead of an unhandled exception; upstream
    failures keep the original ``{'error': 'Valor no encontrado.'}`` payload.
    """
    import re  # local import keeps the block self-contained

    if not validate_key(request):
        # NOTE(review): answered with the default 200 status, like pdf_parse;
        # consider switching both endpoints to 401 together.
        return jsonify({'message': 'Not Authorized'})
    data = request.get_json()
    valid = {
        "CLF": "uf",
        "IVP": "ivp",
        "USD": "dolar",
        "USDo": "dolar_intercambio",
        "EUR": "euro",
        "IPC": "ipc",
        "UTM": "utm",
        "IMACEC": "imacec",
        "TPM": "tpm",
        "CUP": "libra_cobre",
        "TZD": "tasa_desempleo",
        "BTC": "bitcoin"
    }
    if not isinstance(data, dict):
        return jsonify({'error': 'Solicitud inválida.'}), 400
    desde = data.get('desde')
    fecha = data.get('fecha')
    if not isinstance(desde, str) or desde not in valid:
        return jsonify({'error': 'Indicador no soportado.'}), 400
    # Only digits separated by dashes may reach the URL path; anything else
    # (slashes, dots, query strings) would let the caller alter the upstream URL.
    if not isinstance(fecha, str) or re.fullmatch(r'\d+(?:-\d+)*', fecha) is None:
        return jsonify({'error': 'Fecha inválida.'}), 400

    base_url = 'https://mindicador.cl/api/'
    url = f"{base_url}{valid[desde]}/{'-'.join(reversed(fecha.split('-')))}"
    try:
        res = httpx.get(url)
    except httpx.HTTPError:
        # Network/transport failure: report it the same way as a missing value.
        return jsonify({'error': 'Valor no encontrado.'})
    if res.status_code != httpx.codes.OK:
        return jsonify({'error': 'Valor no encontrado.'})
    try:
        payload = res.json()
    except ValueError:
        return jsonify({'error': 'Valor no encontrado.'})
    return jsonify(payload)
class LOG_LEVEL:
    """Numeric log levels; a larger number is treated as more severe."""
    # NOTE(review): DEBUG ranks above WARNING here - confirm this is intended.
    INFO = 0
    WARNING = 1
    DEBUG = 2
    ERROR = 4

    @staticmethod
    def desc(level):
        """Return the label for *level*; unknown levels fall back to ``str(level)``."""
        mapping = {
            LOG_LEVEL.INFO: 'INFO',
            LOG_LEVEL.WARNING: 'WARNING',
            LOG_LEVEL.DEBUG: 'DEBUG',
            LOG_LEVEL.ERROR: 'ERROR'
        }
        # A logger must not crash because of an unexpected level value.
        return mapping.get(level, str(level))


class Logger:
    """Fan a message out to several :class:`Log` files, each with a minimum level."""

    def __init__(self):
        self._logs = []

    def add_log(self, filename: str, min_level: int = LOG_LEVEL.INFO):
        """Register *filename* to receive messages of *min_level* or above.

        Returns ``self`` so registrations can be chained.
        """
        self._logs.append({'log': Log(filename), 'level': min_level})
        self._logs.sort(key=lambda e: e['level'])
        return self

    def log(self, message, level: int = LOG_LEVEL.INFO):
        """Write *message* to every registered log whose minimum level is met."""
        # Exceptions are always written as errors by Log.log; apply the same
        # promotion here so they are routed to the error log as well.
        if isinstance(message, Exception) and level < LOG_LEVEL.ERROR:
            level = LOG_LEVEL.ERROR
        for entry in self._logs:
            # The comparison was inverted: a log registered with min_level
            # ERROR received everything and an INFO log only INFO messages.
            if level >= entry['level']:
                entry['log'].log(message, level)


class Log:
    """Append-only log file that moves to ``name.<n>.ext`` once it grows too big."""

    # Size in bytes above which the next numbered file is used.
    MAX_SIZE = 10 * 1024 * 1024

    def __init__(self, filename: str = '/var/log/python/error.log'):
        self._filename = filename

    def log(self, message, level: int = LOG_LEVEL.INFO):
        """Append one timestamped line (or traceback) to the current file.

        :param message: text, any object convertible with ``str()``, or an
            exception instance (written as a traceback at ERROR level or above).
        :param level: one of the ``LOG_LEVEL`` constants.
        """
        if isinstance(message, Exception):
            # Format the exception that was passed in; format_exc() only sees
            # the exception currently being handled and prints "NoneType: None"
            # when called outside an ``except`` block.
            message = ''.join(traceback.format_exception(type(message), message, message.__traceback__))
            if level < LOG_LEVEL.ERROR:
                level = LOG_LEVEL.ERROR
        elif not isinstance(message, str):
            message = str(message)
        self.rotate_file()
        with open(self._filename, 'a', encoding='utf-8') as f:
            f.write(time.strftime('[%Y-%m-%d %H:%M:%S] ') + ' - ' + LOG_LEVEL.desc(level=level) + ': ' + message + "\n")

    def rotate_file(self):
        """Advance to the first numbered file that is missing or still under MAX_SIZE."""
        # Loop so that a restart (which begins at the base name again) skips
        # every file that is already full instead of just the first one.
        while os.path.isfile(self._filename) and os.path.getsize(self._filename) > self.MAX_SIZE:
            self.next_file()

    def next_file(self):
        """Switch to the next numbered file name: ``a.log`` -> ``a.1.log`` -> ``a.2.log``."""
        # splitext only looks at the last path component, so dots in directory
        # names no longer truncate the path and extension-less names are safe.
        root, ext = os.path.splitext(self._filename)
        if ext[1:].isdecimal():
            # Extension-less name that was already rotated: "name.<n>".
            stem, counter, ext = root, ext, ''
        else:
            stem, counter = os.path.splitext(root)
        if counter[1:].isdecimal():
            n = int(counter[1:]) + 1
        else:
            stem, n = root, 1
        self._filename = f"{stem}.{n}{ext}"
def text_cleanup(text):
    """Detect the issuing bank of a statement and return its cleaned-up text.

    :param text: the extracted PDF text, either one string or a list of
        strings (joined with three newlines before processing).
    :return: ``{'bank': <name>, 'text': <list>}`` where ``bank`` is ``'BICE'``,
        ``'Scotiabank'`` or ``'unknown'``.
    """
    if isinstance(text, list):
        text = "\n\n\n".join(text)
    lowered = text.lower()
    # NOTE(review): detection is a plain substring test, so a statement that
    # merely mentions another bank may be routed to the wrong parser.
    if 'bice' in lowered:
        return {'bank': 'BICE', 'text': bice(text)}
    if 'scotiabank' in lowered:
        return {'bank': 'Scotiabank', 'text': scotiabank(text)}
    if 'TARJETA' in text:
        return {'bank': 'Scotiabank', 'text': tarjeta(text)}
    return {'bank': 'unknown', 'text': basic(text)}


def bice(text):
    """Group the lines of a BICE current-account statement into rows.

    Raises ``IndexError`` or ``ValueError`` when an expected section marker
    is missing from *text*.
    """
    # Splitting on "\n\n\n", then "\n\n", then "\n" and dropping blanks is the
    # same as what split_words does, so reuse the shared helper.
    lines = split_words(text)
    output = []
    output += extract_from_to(lines, 'NOMBRE DEL CLIENTE', end='LAS CONDES', line_length=3)
    # First line mentioning the movements header; its full text is the marker.
    ti = [t for t in lines if 'MOVIMIENTOS DE LA CUENTA CORRIENTE' in t][0]
    output += extract_from_to(lines, 'LAS CONDES', end=ti, line_length=3)
    output += [ti]
    # The table sits between the first and second lines containing 'FECHA'.
    ti = [i for i, t in enumerate(lines) if 'FECHA' in t]
    output += extract_from_to(lines, ti[0], end=ti[1], line_length=4)
    output += extract_from_to(lines, 'RESUMEN DEL PERIODO', end='SALDO INICIAL', line_length=1)
    output += extract_from_to(lines, 'SALDO INICIAL', end='LINEA SOBREGIRO AUTORIZADA', line_length=4)
    output += extract_from_to(lines, 'LINEA SOBREGIRO AUTORIZADA', end='OBSERVACIONES', line_length=3)
    output += extract_from_to(lines, 'OBSERVACIONES', line_length=1)
    return output


def scotiabank(text):
    """Group the lines of a Scotiabank current-account statement into rows.

    Raises ``IndexError`` or ``ValueError`` when an expected section marker
    is missing from *text*.
    """
    words = split_words(text)
    output = [words[0]]
    # Locate the executive line by its label instead of one person's name so
    # statements handled by any executive can be parsed.
    executive = [i for i, w in enumerate(words) if w.startswith('NOMBRE EJECUTIVO:')][0]
    # split_words strips every word, so the marker must not carry the trailing
    # space it had when the text was split without stripping.
    footer = 'ACTUALICE SIEMPRE ANTECEDENTES LEGALES,'
    output += extract_from_to(words, 'No. CTA.', end='VENCIMIENTO LINEA DE CREDITO', line_length=3)
    output += extract_from_to(words, 'VENCIMIENTO LINEA DE CREDITO', end=executive, line_length=2)
    output += extract_from_to(words, executive, end='SALDO ANTERIOR', line_length=1)
    output += extract_from_to(words, 'SALDO ANTERIOR', end='FECHA', line_length=4)
    output += extract_data(words, 'FECHA', end=footer, line_length=6,
                           merge_list=[['DOCTO', 'No.'], ['SALDO', 'DIARIO']])
    output += extract_from_to(words, footer, 1)
    return output


def tarjeta(text):
    """Group the lines of a Scotiabank credit-card statement into rows.

    Raises ``IndexError`` or ``ValueError`` when an expected section marker
    is missing from *text*.
    """
    words = split_words(text)
    output = ['ESTADO DE CUENTA NACIONAL DE TARJETA DE CRÉDITO']
    i = [k for k, w in enumerate(words) if 'FECHA ESTADO DE CUENTA' in w][0] + 2
    output += extract_from_to(words, 'NOMBRE DEL TITULAR', end=i, line_length=2)
    output += ['I. INFORMACIóN GENERAL']
    # The second line containing 'CUPO TOTAL' closes the first block.
    i = [k for k, w in enumerate(words) if 'CUPO TOTAL' in w][1]
    output += extract_from_to(words, 'CUPO TOTAL', end=i, line_length=3)
    output += extract_from_to(words, i, end='ROTATIVO', line_length=4)
    output += extract_from_to(words, 'ROTATIVO', end='TASA INTERÉS VIGENTE', line_length=3)
    output += extract_from_to(words, 'TASA INTERÉS VIGENTE',
                              end='CAE se calcula sobre un supuesto de gasto mensual de UF 20 y pagadero en 12 cuotas.',
                              line_length=4)
    output += extract_from_to(words, 'DESDE', end='PERÍODO FACTURADO', line_length=2)
    output += extract_from_to(words, 'PERÍODO FACTURADO', end='II.', line_length=3)
    output += ['II. DETALLE']
    output += extract_from_to(words, '1. PERÍODO ANTERIOR', end='SALDO ADEUDADO INICIO PERÍODO ANTERIOR', line_length=3)
    i = words.index('2. PERÍODO ACTUAL')
    output += extract_from_to(words, 'SALDO ADEUDADO INICIO PERÍODO ANTERIOR', end=i - 1, line_length=2,
                              merge_list=[['MONTO FACTURADO A PAGAR (PERÍODO ANTERIOR)', '(A)']], merge_character=" ")
    output += ['2. PERÍODO ACTUAL']
    output += extract_from_to(words, 'LUGAR DE', end='1.TOTAL OPERACIONES', line_length=7,
                              merge_list=[['OPERACIÓN', 'O COBRO'], ['TOTAL A', 'PAGAR'], ['VALOR CUOTA', 'MENSUAL']])
    i = words.index('1.TOTAL OPERACIONES') + 3
    output += extract_from_to(words, '1.TOTAL OPERACIONES', end=i, line_length=3)
    output += extract_from_to(words, i, end='TOTAL PAGOS A LA CUENTA', line_length=7)
    i = words.index('TOTAL PAGOS A LA CUENTA') + 2
    output += extract_from_to(words, 'TOTAL PAGOS A LA CUENTA', end=i, line_length=2)
    output += extract_from_to(words, i, end='TOTAL PAT A LA CUENTA', line_length=8)
    i = words.index('TOTAL PAT A LA CUENTA') + 2
    output += extract_from_to(words, 'TOTAL PAT A LA CUENTA', end=i, line_length=2)
    output += extract_from_to(words, i, end=i + 3, line_length=2,
                              merge_list=[
                                  ['2.PRODUCTOS O SERVICIOS VOLUNTARIAMENTE CONTRATADOS SIN MOVIMIENTOS', '(C)']],
                              merge_character=" ")
    if '3.CARGOS, COMISIONES, IMPUESTOS Y ABONOS' in words:
        i = words.index('3.CARGOS, COMISIONES, IMPUESTOS Y ABONOS') + 3
        output += extract_from_to(words, '3.CARGOS, COMISIONES, IMPUESTOS Y ABONOS', end=i, line_length=3)
    return output


def basic(text):
    """Fallback for unrecognised statements: return the non-empty lines."""
    return split_words(text)


def split_words(text):
    """Split *text* (a string or list of strings) into stripped, non-empty lines."""
    if isinstance(text, list):
        text = "\n\n\n".join(text)
    stripped = (piece.strip() for piece in text.split("\n"))
    return [piece for piece in stripped if piece != '']


def extract_from_to(word_list, start, line_length, end=None, merge_list=None, merge_character="\n"):
    """Slice *word_list* between two markers and group it with extract_by_line.

    :param start: index, or the exact word at which the slice begins.
    :param line_length: forwarded to ``extract_by_line``.
    :param end: index or exact word at which the slice stops (exclusive);
        ``None`` runs to the end of the list.
    :param merge_list: word groups to merge, forwarded to ``extract_by_line``.
    :param merge_character: separator used when merging.
    :raises ValueError: when a marker given as a word is not in *word_list*.
    """
    if not isinstance(start, int):
        start = word_list.index(start)
    if end is not None and not isinstance(end, int):
        end = word_list.index(end)
    # A ``None`` end slices through to the last word.
    return extract_by_line(word_list[start:end], line_length, merge_list, merge_character)
def merge_words(word_list, merge_list, merge_character):
    """Collapse each consecutive word group in *merge_list* into a single word.

    :param word_list: list of words; it is not modified.
    :param merge_list: iterable of word sequences, each merged wherever it
        occurs consecutively in the list.
    :param merge_character: separator placed between the merged words.
    :return: a new list with every occurrence merged.
    """
    for m in merge_list:
        ixs = find_words(word_list, m)
        if not ixs:
            continue
        joined = merge_character.join(m)
        # Replace from the right: every merge shortens the list, which would
        # invalidate the positions that still have to be processed.
        for i in reversed(ixs):
            word_list = word_list[:i] + [joined] + word_list[i + len(m):]
    return word_list


def find_words(word_list, find_list):
    """Return the start indices of every non-overlapping run equal to *find_list*.

    Matches are found left to right; an empty *find_list* matches nothing.
    """
    target = list(find_list)
    size = len(target)
    output = []
    if size == 0:
        return output
    i = 0
    # Stop where a full run no longer fits, so a first-word match at the very
    # end of the list cannot index past it.
    last_start = len(word_list) - size
    while i <= last_start:
        if word_list[i:i + size] == target:
            output.append(i)
            i += size
        else:
            i += 1
    return output
def parse_settings(args):
    """Derive every path and the logger used by ``main`` from the CLI arguments.

    :param args: parsed arguments providing ``filename`` and ``log_file``.
    :return: dict with the keys ``filename``, ``temp``, ``dictionary``,
        ``network``, ``log_file``, ``error_log_file`` and ``logger``.
    """
    output = {'filename': args.filename}
    if not os.path.isfile(output['filename']):
        # Not found as given: look for it in the sibling ``data`` directory.
        output['filename'] = os.path.realpath(os.path.join(os.path.dirname(__file__), '..', 'data', args.filename))
    data_dir = os.path.dirname(output['filename'])
    # Use the file's own stem; splitting the raw argument on '.' broke on
    # names with several dots and on relative paths such as './x.pdf'.
    stem = os.path.splitext(os.path.basename(output['filename']))[0]
    output['temp'] = os.path.realpath(os.path.join(data_dir, stem + '-temp.pdf'))
    output['dictionary'] = os.path.join(data_dir, 'dictionary.json')
    output['network'] = os.path.join(data_dir, 'network.json')
    # ``--log_file`` defaults to None, which os.path.isfile() rejects.
    # NOTE(review): the fallback name below is an assumption - confirm.
    output['log_file'] = args.log_file or 'contabilidad.log'
    if not os.path.isfile(output['log_file']):
        output['log_file'] = os.path.join(os.path.dirname(data_dir), output['log_file'])
    output['error_log_file'] = os.path.join(os.path.dirname(output['log_file']), 'error.log')
    output['logger'] = Logger()
    output['logger'].add_log(output['log_file']).add_log(output['error_log_file'], LOG_LEVEL.ERROR)
    return output


def main(args):
    """Load the AI network, feed it the given PDF and process it."""
    settings = parse_settings(args)

    print('Loading AI')
    network = AI(settings['dictionary'], settings['logger'])
    network.set_filename(settings['network'])
    network.add_source({'filename': settings['filename'], 'password': args.password})
    network.process_sources()
    # NOTE(review): execution stops here, so the dictionary steps below never
    # run - presumably work in progress; confirm before removing either part.
    exit()

    print('Loading dictionary.')
    dictio = dictionary.Dictionary(settings['dictionary'], settings['logger'])
    print('Getting possible phrases.')
    dictio.process(settings['filename'], args.password)
    dictio.to_data()
    # print('Saving dictionary.')
    # dictio.save()


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # The filename is needed to build every other path, so fail early with a
    # usage message instead of a TypeError inside parse_settings.
    parser.add_argument('-f', '--filename', type=str, required=True)
    parser.add_argument('-p', '--password', type=str, default='')
    parser.add_argument('-l', '--log_file', type=str, default=None)
    _args = parser.parse_args()
    main(_args)