Python
This commit is contained in:
@ -2,7 +2,7 @@ FROM python
|
||||
|
||||
RUN apt-get update -y && apt-get install -y ghostscript python3-tk libgl-dev
|
||||
|
||||
RUN pip install flask pyyaml pypdf4 gunicorn camelot-py[cv] pikepdf
|
||||
RUN pip install flask pyyaml pypdf4 gunicorn camelot-py[cv] pikepdf httpx
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
@ -12,4 +12,5 @@ EXPOSE 5000
|
||||
|
||||
WORKDIR /app/src
|
||||
|
||||
CMD ["gunicorn", "-b 0.0.0.0:5000", "app:app"]
|
||||
CMD ["python", "app.py"]
|
||||
#CMD ["gunicorn", "-b 0.0.0.0:5000", "app:app"]
|
||||
|
@ -1,3 +1,4 @@
|
||||
passwords:
|
||||
- 0839
|
||||
- 159608395
|
||||
- 15960839
|
||||
|
@ -1,22 +1,40 @@
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
from flask import Flask, request
|
||||
import httpx
|
||||
from flask import Flask, request, jsonify
|
||||
|
||||
import contabilidad.pdf as pdf
|
||||
import contabilidad.passwords as passwords
|
||||
import contabilidad.log as log
|
||||
import contabilidad.text_handler as th
|
||||
from contabilidad.log import Log
|
||||
|
||||
|
||||
app = Flask(__name__)
|
||||
log.logging['filename'] = '/var/log/python/contabilidad.log'
|
||||
log = Log('/var/log/python/contabilidad.log')
|
||||
api_key = os.environ.get('PYTHON_KEY')
|
||||
|
||||
|
||||
def validate_key(request_obj):
    """Return True when the request carries the configured API key.

    The key is looked up, in order, in the ``Authorization`` header
    (optionally prefixed with ``Bearer``), then in the ``API_KEY`` and
    ``api_key`` request values. The first location that is present decides
    the outcome; later locations are not consulted.

    :param request_obj: object exposing ``headers`` and ``values`` mappings
        (the Flask request in this module).
    :return: True if the supplied key equals the module-level ``api_key``.
    """
    import hmac

    def _matches(candidate):
        # With no configured key (PYTHON_KEY unset) nothing may authenticate.
        if api_key is None or candidate is None:
            return False
        # Constant-time comparison; bytes are used because compare_digest
        # rejects non-ASCII str arguments.
        return hmac.compare_digest(str(candidate).encode('utf-8'), str(api_key).encode('utf-8'))

    if 'Authorization' in request_obj.headers:
        auth = request_obj.headers.get('Authorization')
        if isinstance(auth, list):
            auth = auth[0] if auth else ''
        if 'Bearer' in auth:
            parts = auth.split(' ')
            # A bare "Bearer" header has no token part: reject instead of
            # failing with IndexError.
            if len(parts) < 2:
                return False
            auth = parts[1]
        return _matches(auth)
    for field in ('API_KEY', 'api_key'):
        if field in request_obj.values:
            return _matches(request_obj.values.get(field))
    return False
|
||||
|
||||
|
||||
@app.route('/pdf/parse', methods=['POST'])
|
||||
def pdf_parse():
|
||||
if not validate_key(request):
|
||||
return jsonify({'message': 'Not Authorized'})
|
||||
data = request.get_json()
|
||||
if not isinstance(data['files'], list):
|
||||
data['files'] = [data['files']]
|
||||
@ -32,6 +50,11 @@ def pdf_parse():
|
||||
continue
|
||||
pdf.remove_encryption(filename, p, temp)
|
||||
obj = pdf.get_data(temp)
|
||||
try:
|
||||
text = th.text_cleanup(pdf.get_text(temp))
|
||||
except IndexError as ie:
|
||||
print(ie, file=sys.stderr)
|
||||
continue
|
||||
outputs = []
|
||||
for o in obj:
|
||||
out = json.loads(o.df.to_json(orient='records'))
|
||||
@ -48,8 +71,35 @@ def pdf_parse():
|
||||
out[i] = line
|
||||
outputs.append(out)
|
||||
os.remove(temp)
|
||||
output.append({'filename': file['filename'], 'text': outputs})
|
||||
return json.dumps(output)
|
||||
output.append({'bank': text['bank'], 'filename': file['filename'], 'tables': outputs, 'text': text['text']})
|
||||
return jsonify(output)
|
||||
|
||||
|
||||
@app.route('/cambio/get', methods=['POST'])
def cambios():
    """Look up an economic indicator on mindicador.cl and relay the JSON reply.

    Expects a JSON body with ``desde`` (one of the codes in ``valid``) and
    ``fecha`` (a ``-``-separated date string).

    :return: the upstream JSON on success; ``{'error': ...}`` when the value
        cannot be fetched; ``{'error': ...}`` with HTTP 400 when the payload
        is malformed; ``{'message': 'Not Authorized'}`` when the key check fails.
    """
    if not validate_key(request):
        return jsonify({'message': 'Not Authorized'})
    data = request.get_json()
    # Request code -> indicator path segment used by the upstream API.
    valid = {
        "CLF": "uf",
        "IVP": "ivp",
        "USD": "dolar",
        "USDo": "dolar_intercambio",
        "EUR": "euro",
        "IPC": "ipc",
        "UTM": "utm",
        "IMACEC": "imacec",
        "TPM": "tpm",
        "CUP": "libra_cobre",
        "TZD": "tasa_desempleo",
        "BTC": "bitcoin"
    }
    # Reject malformed payloads up front; previously a missing body, an
    # unknown code or a missing date surfaced as KeyError/TypeError (HTTP 500).
    if not isinstance(data, dict):
        return jsonify({'error': 'Parámetros inválidos.'}), 400
    desde = data.get('desde')
    fecha = data.get('fecha')
    if not isinstance(desde, str) or desde not in valid or not isinstance(fecha, str):
        return jsonify({'error': 'Parámetros inválidos.'}), 400
    base_url = 'https://mindicador.cl/api/'
    # The date components are sent in reverse order
    # (presumably YYYY-MM-DD in, DD-MM-YYYY out — confirm against the client).
    fecha_url = '-'.join(reversed(fecha.split('-')))
    url = f"{base_url}{valid[desde]}/{fecha_url}"
    try:
        res = httpx.get(url)
    except httpx.RequestError:
        # Network-level failure: report it the same way as a non-OK reply.
        return jsonify({'error': 'Valor no encontrado.'})
    if res.status_code != httpx.codes.OK:
        return jsonify({'error': 'Valor no encontrado.'})
    return jsonify(res.json())
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
@ -1,19 +1,65 @@
|
||||
import os.path
|
||||
import time
|
||||
|
||||
|
||||
logging = {
|
||||
'filename': '/var/log/python/error.log'
|
||||
}
|
||||
import traceback
|
||||
|
||||
|
||||
class LOG_LEVEL:
|
||||
INFO = 'INFO'
|
||||
WARNING = 'WARNING'
|
||||
DEBUG = 'DEBUG'
|
||||
ERROR = 'ERROR'
|
||||
INFO = 0
|
||||
WARNING = 1
|
||||
DEBUG = 2
|
||||
ERROR = 4
|
||||
|
||||
@staticmethod
|
||||
def desc(level):
|
||||
mapping = {
|
||||
LOG_LEVEL.INFO: 'INFO',
|
||||
LOG_LEVEL.WARNING: 'WARNING',
|
||||
LOG_LEVEL.DEBUG: 'DEBUG',
|
||||
LOG_LEVEL.ERROR: 'ERROR'
|
||||
}
|
||||
return mapping[level]
|
||||
|
||||
|
||||
def log(message, level=LOG_LEVEL.INFO):
|
||||
filename = logging['filename']
|
||||
with open(filename, 'a') as f:
|
||||
f.write(time.strftime('[%Y-%m-%d %H:%M:%S] ') + ' - ' + level + ': ' + message)
|
||||
class Logger:
    """Fan a message out to several ``Log`` files, each with a minimum level."""

    def __init__(self):
        # Each entry is {'log': Log, 'level': minimum level accepted}.
        self._logs = []

    def add_log(self, filename: str, min_level: int = LOG_LEVEL.INFO):
        """Register a log file that accepts messages at ``min_level`` or above.

        :param filename: path handed to ``Log``.
        :param min_level: lowest ``LOG_LEVEL`` value written to this file.
        :return: ``self``, so calls can be chained.
        """
        self._logs.append({'log': Log(filename), 'level': min_level})
        self._logs.sort(key=lambda e: e['level'])
        return self

    def log(self, message, level: int = LOG_LEVEL.INFO):
        """Write ``message`` to every registered log whose minimum level it meets.

        :param message: text or exception to record.
        :param level: ``LOG_LEVEL`` value of the message.
        """
        # Log.log promotes exceptions to ERROR; apply the same promotion
        # before filtering so exceptions reach the error-only sinks.
        if isinstance(message, Exception) and level < LOG_LEVEL.ERROR:
            level = LOG_LEVEL.ERROR
        for entry in self._logs:
            # The comparison was inverted: a sink registered with
            # min_level=ERROR received every message and the INFO sink
            # dropped everything above INFO.
            if level >= entry['level']:
                entry['log'].log(message, level)
|
||||
|
||||
|
||||
class Log:
    """Append-only text log that moves on to a numbered file once it grows too large."""

    # Size in bytes above which the current file is no longer written to.
    MAX_SIZE = 10 * 1024 * 1024

    def __init__(self, filename: str = '/var/log/python/error.log'):
        """:param filename: path of the file to append to."""
        self._filename = filename

    def log(self, message, level: int = LOG_LEVEL.INFO):
        """Append one timestamped line (or traceback) to the current file.

        :param message: text to record; an ``Exception`` is rendered as its
            traceback and recorded at ``ERROR`` level at least.
        :param level: ``LOG_LEVEL`` value of the message.
        """
        if isinstance(message, Exception):
            # Format the exception that was passed in. format_exc() only
            # reports the exception currently being handled, which is
            # "NoneType: None" outside an except block.
            message = ''.join(traceback.format_exception(type(message), message, message.__traceback__))
            if level < LOG_LEVEL.ERROR:
                level = LOG_LEVEL.ERROR
        self.rotate_file()
        with open(self._filename, 'a', encoding='utf-8') as f:
            # str() keeps non-string messages (dicts, numbers) from raising TypeError.
            f.write(time.strftime('[%Y-%m-%d %H:%M:%S] ') + ' - ' + LOG_LEVEL.desc(level=level) + ': '
                    + str(message) + "\n")

    def rotate_file(self):
        """Advance to the next numbered file while the current one exceeds MAX_SIZE."""
        # Loop rather than step once: after a restart several numbered files
        # may already be full.
        while os.path.isfile(self._filename) and os.path.getsize(self._filename) > self.MAX_SIZE:
            self.next_file()

    def next_file(self):
        """Switch to the next numbered file name (``x.log`` -> ``x.1.log`` -> ``x.2.log``)."""
        # splitext only looks at the last path component, so dots in
        # directory names no longer corrupt the path.
        root, ext = os.path.splitext(self._filename)
        if ext[1:].isnumeric():
            # Name without a real extension that was already rotated: "x.3" -> "x.4".
            self._filename = f"{root}.{int(ext[1:]) + 1}"
            return
        stem, counter = os.path.splitext(root)
        n = 1
        if counter[1:].isnumeric():
            root = stem
            n = int(counter[1:]) + 1
        self._filename = f"{root}.{n}{ext}"
|
||||
|
@ -1,48 +1,112 @@
|
||||
def text_cleanup(text, filename: str = None):
|
||||
def text_cleanup(text: str):
|
||||
if isinstance(text, list):
|
||||
output = []
|
||||
for t in text:
|
||||
output.append(text_cleanup(t, filename=filename))
|
||||
return output
|
||||
if filename is None:
|
||||
return text
|
||||
if 'bice' in filename.lower():
|
||||
return bice(text)
|
||||
if 'scotiabank' in filename.lower():
|
||||
return scotiabank(text)
|
||||
return text
|
||||
text = "\n\n\n".join(text)
|
||||
if 'bice' in text.lower():
|
||||
return {'bank': 'BICE', 'text': bice(text)}
|
||||
if 'scotiabank' in text.lower():
|
||||
return {'bank': 'Scotiabank', 'text': scotiabank(text)}
|
||||
if 'TARJETA' in text:
|
||||
return {'bank': 'Scotiabank', 'text': tarjeta(text)}
|
||||
return {'bank': 'unknown', 'text': basic(text)}
|
||||
|
||||
|
||||
def bice(text):
|
||||
lines = text.split("\n\n\n")
|
||||
print(lines)
|
||||
return text
|
||||
lines = [t2.strip() for t in text.split("\n\n\n")
|
||||
for t1 in t.split("\n\n") for t2 in t1.split("\n") if t2.strip() != '']
|
||||
output = []
|
||||
output += extract_from_to(lines, 'NOMBRE DEL CLIENTE', end='LAS CONDES', line_length=3)
|
||||
ti = [t for t in lines if 'MOVIMIENTOS DE LA CUENTA CORRIENTE' in t][0]
|
||||
output += extract_from_to(lines, 'LAS CONDES', end=ti, line_length=3)
|
||||
output += [ti]
|
||||
ti = [i for i, t in enumerate(lines) if 'FECHA' in t]
|
||||
output += extract_from_to(lines, ti[0], end=ti[1], line_length=4)
|
||||
output += extract_from_to(lines, 'RESUMEN DEL PERIODO', end='SALDO INICIAL', line_length=1)
|
||||
output += extract_from_to(lines, 'SALDO INICIAL', end='LINEA SOBREGIRO AUTORIZADA', line_length=4)
|
||||
output += extract_from_to(lines, 'LINEA SOBREGIRO AUTORIZADA', end='OBSERVACIONES', line_length=3)
|
||||
output += extract_from_to(lines, 'OBSERVACIONES', line_length=1)
|
||||
return output
|
||||
|
||||
|
||||
def scotiabank(text):
|
||||
words = text.split("\n")
|
||||
words = split_words(text)
|
||||
output = [words[0]]
|
||||
output = output + extract_from_to(words, 'No. CTA.', end='VENCIMIENTO LINEA DE CREDITO', line_length=3)
|
||||
output = output + extract_from_to(words, 'VENCIMIENTO LINEA DE CREDITO',
|
||||
output += extract_from_to(words, 'No. CTA.', end='VENCIMIENTO LINEA DE CREDITO', line_length=3)
|
||||
output += extract_from_to(words, 'VENCIMIENTO LINEA DE CREDITO',
|
||||
end='NOMBRE EJECUTIVO: LILIAN AVILA MANRIQUEZ', line_length=2)
|
||||
output = output + extract_from_to(words, 'NOMBRE EJECUTIVO: LILIAN AVILA MANRIQUEZ', end='SALDO ANTERIOR',
|
||||
output += extract_from_to(words, 'NOMBRE EJECUTIVO: LILIAN AVILA MANRIQUEZ', end='SALDO ANTERIOR',
|
||||
line_length=1)
|
||||
output = output + extract_from_to(words, 'SALDO ANTERIOR', end='FECHA', line_length=4)
|
||||
output = output + extract_data(words, 'FECHA', end='ACTUALICE SIEMPRE ANTECEDENTES LEGALES, ', line_length=6,
|
||||
output += extract_from_to(words, 'SALDO ANTERIOR', end='FECHA', line_length=4)
|
||||
output += extract_data(words, 'FECHA', end='ACTUALICE SIEMPRE ANTECEDENTES LEGALES, ', line_length=6,
|
||||
merge_list=[['DOCTO', 'No.'], ['SALDO', 'DIARIO']])
|
||||
[print(li) for li in output]
|
||||
return text
|
||||
output += extract_from_to(words, 'ACTUALICE SIEMPRE ANTECEDENTES LEGALES, ', 1)
|
||||
return output
|
||||
|
||||
|
||||
def extract_from_to(word_list, start, line_length, end: str = None, merge_list=None):
|
||||
def tarjeta(text):
    """Arrange the words of a credit-card statement into rows.

    The text is split into non-empty stripped lines, then consecutive
    sections are cut out with ``extract_from_to`` using literal headings as
    anchors and a fixed number of columns per section.

    Note: string anchors passed to ``extract_from_to`` and ``words.index``
    must equal a whole line; the list comprehensions below match substrings.
    A missing anchor raises ``ValueError`` / ``IndexError``.

    :param text: statement text (string, or list of strings joined by ``split_words``).
    :return: list starting with the fixed title followed by the extracted rows.
    """
    words = split_words(text)
    output = ['ESTADO DE CUENTA NACIONAL DE TARJETA DE CRÉDITO']
    # Section ends two lines after the first line containing the statement-date heading.
    i = [i for i, w in enumerate(words) if 'FECHA ESTADO DE CUENTA' in w][0] + 2
    output += extract_from_to(words, 'NOMBRE DEL TITULAR', end=i, line_length=2)
    output += ['I. INFORMACIóN GENERAL']
    # Index of the second line containing 'CUPO TOTAL'.
    i = [i for i, w in enumerate(words) if 'CUPO TOTAL' in w][1]
    output += extract_from_to(words, 'CUPO TOTAL', end=i, line_length=3)
    output += extract_from_to(words, i, end='ROTATIVO', line_length=4)
    output += extract_from_to(words, 'ROTATIVO', end='TASA INTERÉS VIGENTE', line_length=3)
    output += extract_from_to(words, 'TASA INTERÉS VIGENTE',
                              end='CAE se calcula sobre un supuesto de gasto mensual de UF 20 y pagadero en 12 cuotas.',
                              line_length=4)
    output += extract_from_to(words, 'DESDE', end='PERÍODO FACTURADO', line_length=2)
    output += extract_from_to(words, 'PERÍODO FACTURADO', end='II.', line_length=3)
    output += ['II. DETALLE']
    output += extract_from_to(words, '1. PERÍODO ANTERIOR', end='SALDO ADEUDADO INICIO PERÍODO ANTERIOR', line_length=3)
    i = words.index('2. PERÍODO ACTUAL')
    # Stops one line before the '2. PERÍODO ACTUAL' heading; the heading itself is appended below.
    output += extract_from_to(words, 'SALDO ADEUDADO INICIO PERÍODO ANTERIOR', end=i - 1, line_length=2,
                              merge_list=[['MONTO FACTURADO A PAGAR (PERÍODO ANTERIOR)', '(A)']], merge_character=" ")
    output += ['2. PERÍODO ACTUAL']
    # Column headers that span two lines are merged into one cell before rows are built.
    output += extract_from_to(words, 'LUGAR DE', end='1.TOTAL OPERACIONES', line_length=7,
                              merge_list=[['OPERACIÓN', 'O COBRO'], ['TOTAL A', 'PAGAR'], ['VALOR CUOTA', 'MENSUAL']])
    # Each total heading is taken with a fixed number of lines, then the rows that follow it.
    i = words.index('1.TOTAL OPERACIONES') + 3
    output += extract_from_to(words, '1.TOTAL OPERACIONES', end=i, line_length=3)
    output += extract_from_to(words, i, end='TOTAL PAGOS A LA CUENTA', line_length=7)
    i = words.index('TOTAL PAGOS A LA CUENTA') + 2
    output += extract_from_to(words, 'TOTAL PAGOS A LA CUENTA', end=i, line_length=2)
    output += extract_from_to(words, i, end='TOTAL PAT A LA CUENTA', line_length=8)
    i = words.index('TOTAL PAT A LA CUENTA') + 2
    output += extract_from_to(words, 'TOTAL PAT A LA CUENTA', end=i, line_length=2)
    output += extract_from_to(words, i, end=i + 3, line_length=2,
                              merge_list=[
                                  ['2.PRODUCTOS O SERVICIOS VOLUNTARIAMENTE CONTRATADOS SIN MOVIMIENTOS', '(C)']],
                              merge_character=" ")
    # The charges section is only extracted when its heading is present.
    if '3.CARGOS, COMISIONES, IMPUESTOS Y ABONOS' in words:
        i = words.index('3.CARGOS, COMISIONES, IMPUESTOS Y ABONOS') + 3
        output += extract_from_to(words, '3.CARGOS, COMISIONES, IMPUESTOS Y ABONOS', end=i, line_length=3)
    return output
|
||||
|
||||
|
||||
def basic(text):
    """Fallback cleanup: return the text as a list of non-empty stripped lines.

    :param text: string, or list of strings, accepted by ``split_words``.
    :return: whatever ``split_words`` produces for ``text``.
    """
    words = split_words(text)
    return words
|
||||
|
||||
|
||||
def split_words(text):
    """Split text into stripped, non-empty lines.

    :param text: a string, or a list of strings that is first joined with
        three newlines.
    :return: list of lines with surrounding whitespace removed and blank
        lines dropped.
    """
    if isinstance(text, list):
        text = "\n\n\n".join(text)
    stripped = (piece.strip() for piece in text.split("\n"))
    return [piece for piece in stripped if piece != '']
|
||||
|
||||
|
||||
def extract_from_to(word_list, start, line_length, end=None, merge_list=None, merge_character="\n"):
|
||||
if not isinstance(start, int):
|
||||
start = word_list.index(start)
|
||||
if end is not None:
|
||||
return extract_by_line(word_list[word_list.index(start):word_list.index(end)], line_length, merge_list)
|
||||
return extract_by_line(word_list[word_list.index(start):], line_length, merge_list)
|
||||
if not isinstance(end, int):
|
||||
end = word_list.index(end)
|
||||
return extract_by_line(word_list[start:end], line_length, merge_list, merge_character)
|
||||
return extract_by_line(word_list[start:], line_length, merge_list, merge_character)
|
||||
|
||||
|
||||
def extract_by_line(word_list, line_length, merge_list=None):
|
||||
def extract_by_line(word_list, line_length, merge_list=None, merge_character="\n"):
|
||||
if merge_list is not None:
|
||||
word_list = merge_words(word_list, merge_list)
|
||||
word_list = merge_words(word_list, merge_list, merge_character)
|
||||
output = []
|
||||
line = []
|
||||
for k, w in enumerate(word_list):
|
||||
@ -54,22 +118,39 @@ def extract_by_line(word_list, line_length, merge_list=None):
|
||||
return output
|
||||
|
||||
|
||||
def merge_words(word_list, merge_list):
|
||||
def merge_words(word_list, merge_list, merge_character):
|
||||
for m in merge_list:
|
||||
i = word_list.index(m[0])
|
||||
word_list = word_list[:i] + ["\n".join(m)] + word_list[i+len(m):]
|
||||
ixs = find_words(word_list, m)
|
||||
if ixs is None:
|
||||
continue
|
||||
for i in ixs:
|
||||
word_list = word_list[:i] + [merge_character.join(m)] + word_list[i + len(m):]
|
||||
return word_list
|
||||
|
||||
|
||||
def extract_data(word_list, start, line_length, end=None, merge_list=None, date_sep='/'):
|
||||
def find_words(word_list, find_list):
    """Return every index at which ``find_list`` occurs as a contiguous run.

    :param word_list: sequence of words to search.
    :param find_list: sequence of words that must appear consecutively.
    :return: list of start indexes in ascending order; empty when there is
        no match or ``find_list`` is empty.
    """
    if not find_list:
        return []
    needle = list(find_list)
    span = len(needle)
    # Comparing slices keeps a partial match at the end of word_list from
    # indexing past the list (the element-by-element loop raised IndexError).
    return [i for i, w in enumerate(word_list)
            if w == needle[0] and list(word_list[i:i + span]) == needle]
|
||||
|
||||
|
||||
def extract_data(word_list, start, line_length, end=None, merge_list=None, merge_character="\n", date_sep='/'):
|
||||
word_list = word_list[word_list.index(start):]
|
||||
if end is not None:
|
||||
word_list = word_list[:word_list.index(end)]
|
||||
if merge_list is not None:
|
||||
word_list = merge_words(word_list, merge_list)
|
||||
word_list = merge_words(word_list, merge_list, merge_character)
|
||||
output = []
|
||||
line = []
|
||||
line_num = 0
|
||||
col = 0
|
||||
for k, w in enumerate(word_list):
|
||||
if col > 0 and col % line_length == 0:
|
||||
@ -87,4 +168,5 @@ def extract_data(word_list, start, line_length, end=None, merge_list=None, date_
|
||||
continue
|
||||
line.append(w)
|
||||
col += 1
|
||||
output.append(line)
|
||||
return output
|
||||
|
@ -3,22 +3,51 @@ import os
|
||||
|
||||
import contabilidad.pdf as pdf
|
||||
import contabilidad.text_handler as th
|
||||
from contabilidad.log import Logger, LOG_LEVEL
|
||||
import ai.dictionary as dictionary
|
||||
from ai.network import AI
|
||||
|
||||
|
||||
def parse_settings(args):
    """Derive file locations and the logger from the parsed command-line arguments.

    :param args: namespace providing ``filename`` and ``log_file``.
    :return: dict with the keys ``filename``, ``temp``, ``dictionary``,
        ``network``, ``log_file``, ``error_log_file`` and ``logger``.
    """
    output = {'filename': args.filename}
    if not os.path.isfile(output['filename']):
        # Not found as given: look for it in the sibling "data" directory.
        output['filename'] = os.path.realpath(os.path.join(os.path.dirname(__file__), '..', 'data', args.filename))
    data_dir = os.path.dirname(output['filename'])
    # Use only the file's own name for the temp copy; taking the raw argument
    # duplicated any directory part it contained.
    stem = os.path.basename(args.filename).split('.')[0]
    output['temp'] = os.path.realpath(os.path.join(data_dir, stem + '-temp.pdf'))
    output['dictionary'] = os.path.join(data_dir, 'dictionary.json')
    output['network'] = os.path.join(data_dir, 'network.json')
    # --log_file defaults to None, which made os.path.isfile raise TypeError;
    # fall back to a fixed name so the option is really optional.
    output['log_file'] = args.log_file if args.log_file is not None else 'main.log'
    if not os.path.isfile(output['log_file']):
        output['log_file'] = os.path.join(os.path.dirname(data_dir), output['log_file'])
    output['error_log_file'] = os.path.join(os.path.dirname(output['log_file']), 'error.log')
    output['logger'] = Logger()
    # The main log takes every level; error.log is registered with ERROR as its minimum.
    output['logger'].add_log(output['log_file']).add_log(output['error_log_file'], LOG_LEVEL.ERROR)
    return output
|
||||
|
||||
|
||||
def main(args):
|
||||
filename = os.path.realpath(os.path.join(os.path.dirname(__file__), '..', 'data', args.filename))
|
||||
temp = os.path.realpath(os.path.join(os.path.dirname(__file__), '..', 'data', args.temp_filename))
|
||||
pdf.remove_encryption(filename, args.password, temp)
|
||||
obj = pdf.get_data(temp)
|
||||
obj = pdf.get_text(filename, args.password)
|
||||
text = th.text_cleanup(obj, filename=str(args.filename))
|
||||
os.remove(temp)
|
||||
settings = parse_settings(args)
|
||||
|
||||
print('Loading AI')
|
||||
network = AI(settings['dictionary'], settings['logger'])
|
||||
network.set_filename(settings['network'])
|
||||
network.add_source({'filename': settings['filename'], 'password': args.password})
|
||||
network.process_sources()
|
||||
exit()
|
||||
|
||||
print('Loading dictionary.')
|
||||
dictio = dictionary.Dictionary(settings['dictionary'], settings['logger'])
|
||||
print('Getting possible phrases.')
|
||||
dictio.process(settings['filename'], args.password)
|
||||
dictio.to_data()
|
||||
# print('Saving dictionary.')
|
||||
# dictio.save()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('-f', '--filename', type=str)
|
||||
parser.add_argument('-p', '--password', type=str, default='')
|
||||
parser.add_argument('-t', '--temp_filename', type=str)
|
||||
parser.add_argument('-l', '--log_file', type=str, default=None)
|
||||
_args = parser.parse_args()
|
||||
main(_args)
|
||||
|
Reference in New Issue
Block a user