4 Commits

SHA1 Message Date
605c905f5d Python 2021-12-22 21:53:30 -03:00
960c418848 Python 2021-12-09 21:14:28 -03:00
25f873c453 Python 2021-12-06 22:13:57 -03:00
34b429530f Python 2021-12-06 22:13:06 -03:00
11 changed files with 929 additions and 66 deletions

Dockerfile

@@ -2,7 +2,7 @@ FROM python
RUN apt-get update -y && apt-get install -y ghostscript python3-tk libgl-dev
RUN pip install flask pyyaml pypdf4 gunicorn camelot-py[cv] pikepdf
RUN pip install flask pyyaml pypdf4 gunicorn camelot-py[cv] pikepdf httpx
WORKDIR /app
@@ -12,4 +12,5 @@ EXPOSE 5000
WORKDIR /app/src
CMD ["gunicorn", "-b 0.0.0.0:5000", "app:app"]
CMD ["python", "app.py"]
#CMD ["gunicorn", "-b 0.0.0.0:5000", "app:app"]


@@ -1,3 +1,4 @@
passwords:
- 0839
- 159608395
- 15960839

Binary file not shown.

python/src/ai/dictionary.py Normal file

@@ -0,0 +1,285 @@
import json
import os

import numpy as np
import sklearn
import enlighten
from sklearn.preprocessing import LabelEncoder

import src.contabilidad.pdf as pdf
import src.contabilidad.text_handler as th
from src.ai.models import Phrase, phrase_factory, Word, word_factory
from src.contabilidad.log import LOG_LEVEL


class Dictionary:
    def __init__(self, filename, logger):
        self.filename = filename
        self._logger = logger
        self.__processed = []
        self.__phrases = None
        self.__words = None
        self.load()

    def load(self):
        if not os.path.isfile(self.filename):
            return
        with open(self.filename, 'r') as file:
            data = json.load(file)
        if 'words' in data:
            self.__words = [word_factory(w) for w in data['words']]
        if 'phrases' in data:
            self.__phrases = [phrase_factory(ph) for ph in data['phrases']]
        if 'processed' in data:
            self.__processed = data['processed']

    def save(self):
        self.sort_words()
        self.sort_phrases()
        with open(self.filename, 'w') as file:
            json.dump(self.to_json(), file, indent=2)

    def to_data(self):
        encoder = LabelEncoder()
        data = encoder.fit_transform([w.get_word() for w in self.get_words()])
        for i, f in enumerate(data):
            self.__words[i].set_fit(f)
        print(data)
        # return [ph.to_data() for ph in self.get_phrases()]

    def to_json(self):
        output = {
            'processed': [],
            'words': [],
            'phrases': []
        }
        if self.__processed is not None and len(self.__processed) > 0:
            output['processed'] = self.__processed
        if self.__words is not None and len(self.__words) > 0:
            output['words'] = [w.to_json() for w in self.__words]
        if self.__phrases is not None and len(self.__phrases) > 0:
            output['phrases'] = [p.to_json() for p in self.__phrases]
        return output

    def find_phrase(self, phrase: Phrase = None, phrase_dict: dict = None, phrase_list: list = None):
        if not self.__phrases:
            return -1
        if phrase is not None:
            phrase_list = [w.get_word() for w in phrase.get_words()]
        elif phrase_dict is not None:
            phrase_list = phrase_dict['words']
        elif phrase_list is not None:
            pass
        else:
            return -1
        return find_phrase(self.__phrases, phrase_list)

    def add_phrase(self, phrase: Phrase = None, phrase_dict: dict = None, phrase_list: list = None):
        if self.__phrases is None:
            self.__phrases = []
        if phrase is not None:
            pass
        elif phrase_dict is not None:
            phrase = phrase_factory(phrase_dict)
        elif phrase_list is not None:
            phrase = phrase_factory({'words': phrase_list})
        else:
            return self
        i = self.find_phrase(phrase)
        if i > -1:
            self.__phrases[i].add_freq()
            return self
        self.__phrases.append(phrase)
        return self

    def add_phrases(self, phrase_list: list):
        if self.__phrases is None:
            self.__phrases = []
        phs = [sorted(w.get_word() for w in p.get_words()) for p in self.__phrases]
        with enlighten.get_manager() as manager:
            with manager.counter(total=len(phrase_list), desc='Phrases', unit='phrases', color='green') as bar1:
                for i, phrase in enumerate(phrase_list):
                    # print(f'Adding phrase {i}.')
                    p2 = sorted([w.get_word() for w in phrase])
                    if p2 in phs:
                        k = phs.index(p2)
                        self.__phrases[k].add_freq()
                    else:
                        ph = phrase_factory({'words': phrase})
                        self.__phrases.append(ph)
                        phs.append(p2)
                    bar1.update()

    def get_phrases(self):
        return self.__phrases

    def sort_phrases(self):
        if self.__phrases is None:
            return self

        def sort_phrase(p):
            if p is None:
                return 0, '', 0
            if isinstance(p, Phrase):
                return p.get_freq(), p.get_type().get_desc(), len(p.get_words())
            return p['frequency'], p['type']['description'], len(p['words'])

        try:
            self.__phrases = sorted(self.__phrases, key=sort_phrase)
        except Exception as e:
            self._logger.log(repr(self.__phrases), LOG_LEVEL.ERROR)
            self._logger.log(e)
        return self

    def sort_words(self):
        if self.__words is None:
            return self

        def sort_word(w):
            if w is None:
                return 0, '', ''
            if isinstance(w, Word):
                return w.get_freq(), w.get_type().get_desc(), w.get_word()
            return w['frequency'], w['type']['description'], w['word']

        try:
            self.__words = sorted(self.__words, key=sort_word, reverse=True)
        except Exception as e:
            self._logger.log(repr(self.__words))
            self._logger.log(e)
        return self

    def find_word(self, word: Word = None, word_dict: dict = None, word_str: str = None):
        if not self.__words:
            return -1
        if word is not None:
            word_str = word.get_word()
        elif word_dict is not None:
            word_str = word_dict['word']
        elif word_str is not None:
            pass
        else:
            return -1
        return find_word(self.__words, word_str)

    def add_word(self, word: Word = None, word_dict: dict = None, word_str: str = None):
        if self.__words is None:
            self.__words = []
        if word is not None:
            pass
        elif word_dict is not None:
            word = word_factory(word_dict)
        elif word_str is not None:
            word = word_factory({'word': word_str})
        else:
            return self
        i = self.find_word(word)
        if i > -1:
            self.__words[i].add_freq()
            return self
        self.__words.append(word)
        return self

    def add_words(self, words: list):
        for w in words:
            if isinstance(w, Word):
                self.add_word(word=w)
            elif isinstance(w, dict):
                self.add_word(word_dict=w)
            elif isinstance(w, str):
                self.add_word(word_str=w)
        return self

    def get_words(self):
        return filter_unique_words(self.__words)

    def match_words(self, word_list: list):
        new_list = []
        for w in word_list:
            wi = self.find_word(word_str=w)
            if wi > -1:
                new_list.append(self.__words[wi])
        return new_list

    def append_to_phrase(self, seed: list = None, length: int = 1):
        if seed is None:
            return [self.__words[0]]
        max_index = max(seed) + length
        if max_index >= len(self.__words):
            if length == 1:
                return False
            return self.append_to_phrase(seed, length - 1)
        return seed + [self.__words[max_index]]

    def get_possible_phrases(self, word_list):
        print('Adding words.')
        self.add_words(word_list)
        print('Creating phrases.')
        with enlighten.get_manager() as manager:
            with manager.counter(total=len(word_list)**2, desc='Phrases', unit='words', color='red') as bar1:
                phrases = []
                for length in range(1, len(word_list) + 1):
                    bar2 = bar1.add_subcounter(color='green')
                    for start in range(0, len(word_list)):
                        phrase = build_phrase(word_list, start, start + length)
                        phrase = self.match_words(phrase)
                        phrases.append(phrase)
                        bar2.update()
                    bar1.update()
        print(f'Created {len(phrases)} phrases.')
        phrases = sorted(phrases, key=lambda e: len(e))
        print('Adding phrases.')
        # Really slow (~115000 phrases in one pdf)
        self.add_phrases(phrases)
        return self.__phrases

    def is_processed(self, filename: str):
        return os.path.basename(filename) in self.__processed

    def process(self, filename: str, password: str = None):
        if self.is_processed(filename):
            print('Already processed.')
            return
        t = filename.split('.')
        temp = os.path.realpath(os.path.join(os.path.dirname(filename), t[0] + '-temp.pdf'))
        print('Removing PDF encryption.')
        pdf.remove_encryption(filename, password, temp)
        print('Getting text.')
        obj = pdf.get_text(temp)
        os.remove(temp)
        print('Getting possible phrases.')
        phrases = self.get_possible_phrases(th.split_words(obj))
        self.__processed.append(os.path.basename(filename))
        return phrases


def build_phrase(word_list, start: int, end: int = None):
    if end is None:
        return word_list[start:]
    return word_list[start:end]


def filter_unique_words(words):
    new_list = []
    for w in words or []:
        if w not in new_list:
            new_list.append(w)
    return new_list


def validate_phrase(phrase):
    return True


def find_phrase(phrases: list, phrase: list):
    phrase_list = [sorted([w.get_word() for w in p.get_words()]) for p in phrases]
    sphrase = sorted(phrase)
    if sphrase in phrase_list:
        return phrase_list.index(sphrase)
    return -1


def find_word(words: list, word: str):
    word_list = [w.get_word() for w in words]
    if word in word_list:
        return word_list.index(word)
    return -1
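
A minimal usage sketch for the Dictionary class above (not part of the commit; the file names, log name and password are placeholders):

from src.contabilidad.log import Logger
from src.ai.dictionary import Dictionary

logger = Logger()
logger.add_log('contabilidad.log')              # placeholder log file
dictio = Dictionary('dictionary.json', logger)  # load() picks up an existing file automatically
phrases = dictio.process('statement.pdf', password='0839')  # decrypts, extracts text, builds phrases
dictio.save()                                   # persists words, phrases and the processed-file list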

python/src/ai/models.py Normal file

@@ -0,0 +1,243 @@
import json


class Type:
    # single-underscore attributes so subclasses can update the description;
    # a name-mangled self.__description would be invisible to them
    def __init__(self, _id, _description):
        self._id = _id
        self._description = _description

    def get_id(self):
        return self._id

    def get_desc(self):
        return self._description

    def to_json(self):
        return self.get_id()

    def __repr__(self):
        return json.dumps({
            'id': self.get_id(),
            'description': self.get_desc()
        })


def type_factory(_type: str, _id: int):
    if _type == 'Word' or _type == 'WordType':
        t = WordType()
    elif _type == 'Phrase' or _type == 'PhraseType':
        t = PhraseType()
    else:
        return None
    t.load(_id)
    return t


class WordType(Type):
    STRING = 0
    NUMERIC = 1
    CURRENCY = 2
    DATE = 4

    def __init__(self):
        super().__init__(0, 'string')

    def load(self, word_type: int):
        self._id = word_type
        if word_type == self.STRING:
            self._description = 'string'
        elif word_type == self.NUMERIC:
            self._description = 'numeric'
        elif word_type == self.CURRENCY:
            self._description = 'currency'
        elif word_type == self.DATE:
            self._description = 'date'
        return self


class PhraseType(Type):
    TEXT = 0
    TITLE = 1
    HEADER = 2
    MOVEMENT = 4
    INVALID = 99

    def __init__(self):
        super().__init__(0, 'text')

    def load(self, phrase_type: int):
        self._id = phrase_type
        if phrase_type == self.TEXT:
            self._description = 'text'
        elif phrase_type == self.TITLE:
            self._description = 'title'
        elif phrase_type == self.HEADER:
            self._description = 'header'
        elif phrase_type == self.MOVEMENT:
            self._description = 'movement'
        elif phrase_type == self.INVALID:
            self._description = 'invalid'
        return self


class Word:
    def __init__(self):
        self.__id = 0
        self.__word = None
        self.__type_id = 0
        self.__type = None
        self.__frequency = 1
        self.__fit = None

    def set_id(self, idx: int):
        self.__id = idx
        return self

    def set_word(self, word: str):
        self.__word = word
        return self

    def set_type(self, word_type):
        if isinstance(word_type, WordType):
            self.__type_id = word_type.get_id()
            # self.__type = word_type
        if isinstance(word_type, int):
            self.__type_id = word_type
            # self.__type = type_factory('Word', word_type)
        return self

    def set_fit(self, fit):
        # numeric label assigned by the Dictionary's LabelEncoder (see dictionary.to_data)
        self.__fit = fit
        return self

    def get_fit(self):
        return self.__fit

    def add_freq(self, amount: int = 1):
        self.__frequency += amount
        return self

    def get_id(self) -> int:
        return self.__id

    def get_word(self) -> str:
        return self.__word

    def get_type_id(self) -> int:
        return self.__type_id

    def get_type(self) -> WordType:
        if self.__type is None:
            self.__type = type_factory('Word', self.__type_id)
        return self.__type

    def get_freq(self) -> int:
        return self.__frequency

    def to_json(self) -> dict:
        output = {
            'id': self.get_id(),
            'word': self.get_word(),
            'type': self.get_type_id(),
            'freq': self.get_freq()
        }
        return output

    def __repr__(self):
        return json.dumps(self.to_json())


def word_factory(word: dict) -> Word:
    w = Word()
    w.set_id(word.get('id', 0))
    w.set_word(word['word'])
    if 'type' in word:
        w.set_type(word['type'])
    if 'freq' in word:
        w.add_freq(word['freq'] - 1)
    return w


class Phrase:
    def __init__(self):
        self.__id = 0
        self.__words = None
        self.__type_id = 0
        self.__type = None
        self.__frequency = 1

    def set_id(self, idx: int):
        self.__id = idx
        return self

    def add_word(self, word):
        # a phrase stores word ids, not Word objects
        if self.__words is None:
            self.__words = []
        if isinstance(word, Word):
            self.__words.append(word.get_id())
        if isinstance(word, dict):
            if 'id' in word:
                self.__words.append(word['id'])
        if isinstance(word, int):
            self.__words.append(word)
        return self

    def set_words(self, words: list):
        if self.__words is None:
            self.__words = []
        for w in words:
            self.add_word(w)
        return self

    def set_type(self, phrase_type):
        if isinstance(phrase_type, PhraseType):
            self.__type_id = phrase_type.get_id()
            # self.__type = phrase_type
        if isinstance(phrase_type, int):
            self.__type_id = phrase_type
            # self.__type = type_factory('Phrase', phrase_type)
        return self

    def add_freq(self, amount: int = 1):
        self.__frequency += amount
        return self

    def get_id(self) -> int:
        return self.__id

    def get_words(self) -> list:
        return self.__words

    def get_type_id(self) -> int:
        return self.__type_id

    def get_type(self) -> PhraseType:
        if self.__type is None:
            self.__type = type_factory('Phrase', self.__type_id)
        return self.__type

    def get_freq(self) -> int:
        return self.__frequency

    def match(self, word_list: list):
        if len(word_list) != len(self.__words):
            return False
        return sorted(self.__words) == sorted(word_list)

    def to_json(self):
        output = {
            'id': self.get_id(),
            'words': self.get_words(),
            'type': self.get_type_id(),
            'freq': self.get_freq()
        }
        return output

    def __repr__(self):
        return json.dumps(self.to_json())

    def __len__(self):
        return len(self.get_words())


def phrase_factory(phrase: dict) -> Phrase:
    ph = Phrase()
    ph.set_id(phrase.get('id', 0))
    ph.set_words(phrase['words'])
    if 'type' in phrase:
        ph.set_type(phrase['type'])
    if 'freq' in phrase:
        ph.add_freq(phrase['freq'] - 1)
    return ph
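
A quick round-trip sketch of the factories above (assuming the .get('id', 0) defaults shown; output comments are illustrative):

from src.ai.models import word_factory, phrase_factory

w = word_factory({'word': 'SALDO'})   # id defaults to 0, frequency starts at 1
w.add_freq()
print(w.to_json())                    # {'id': 0, 'word': 'SALDO', 'type': 0, 'freq': 2}
ph = phrase_factory({'words': [w], 'freq': 3})
print(len(ph), ph.get_freq())         # 1 3
print(ph.to_json()['words'])          # [0] -- word ids, not Word objects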

python/src/ai/network.py Normal file

@@ -0,0 +1,126 @@
import json
import os
import time
import timeit

import tensorflow as tf
import sklearn
import numpy as np
from sklearn.preprocessing import LabelEncoder

import src.contabilidad.pdf as pdf
import src.contabilidad.text_handler as th


class Layer:
    def __init__(self):
        self.__weights = None
        self.__bias = None

    def set_size(self, inputs: int, size: int):
        self.__weights = [[0 for j in range(0, inputs)] for i in range(0, size)]
        self.__bias = [0 for i in range(0, size)]

    def add_weight(self, vector: list, idx: int = None):
        if self.__weights is None:
            self.__weights = []
        if idx is None:
            self.__weights.append(vector)
            return self
        self.__weights = self.__weights[:idx] + [vector] + self.__weights[idx:]
        return self

    def set_weight(self, value: float, weight_index: int, input_index: int):
        self.__weights[weight_index][input_index] = value

    def set_bias(self, value: list):
        self.__bias = value

    def train(self, input_values: list, output_values: list):
        # unfinished in this revision: errors are computed but never applied
        output = self.get_output(input_values)
        errors = []
        for i, v in enumerate(output):
            error = (output_values[i] - v) / output_values[i]
            errors.append(v * error)

    def to_json(self):
        return {
            'bias': self.__bias,
            'weights': self.__weights
        }

    def get_output(self, vector: list):
        output = []
        for i, weight in enumerate(self.__weights):
            val = 0
            for j, v in enumerate(weight):
                val += v * vector[j]
            output.append(val + self.__bias[i])
        return output


def layer_factory(layer_dict: dict):
    layer = Layer()
    layer.set_bias(layer_dict['bias'])
    for w in layer_dict['weights']:
        layer.add_weight(w)
    return layer


class Network:
    def __init__(self, filename: str):
        self._filename = filename
        self.__layers = None

    def load(self):
        with open(self._filename) as f:
            data = json.load(f)
        if 'layers' in data.keys():
            self.add_layers(data['layers'])

    def add_layers(self, layers: list):
        if self.__layers is None:
            self.__layers = []
        for lr in layers:
            layer = layer_factory(lr)
            self.__layers.append(layer)


class AI:
    def __init__(self, dictionary_filename, logger):
        # path of the dictionary backing this network and a Logger instance
        self._dictionary_filename = dictionary_filename
        self._logger = logger
        self.__dict = None
        self.__network = None
        self.__sources = None
        self._phrases = None
        self.filename = ''

    def add_source(self, text):
        if self.__sources is None:
            self.__sources = []
        self.__sources.append(text)
        return self

    def set_filename(self, filename: str):
        self.filename = filename
        return self

    def process_sources(self):
        for source in self.__sources:
            self.process(**source)

    def process(self, filename, password):
        encoder = LabelEncoder()
        t = filename.split('.')
        temp = os.path.realpath(os.path.join(os.path.dirname(filename), t[0] + '-temp.pdf'))
        pdf.remove_encryption(filename, password, temp)
        obj = pdf.get_text(temp)
        os.remove(temp)
        word_list = th.split_words(obj)
        fits = encoder.fit_transform(word_list)
        phrases = []
        for length in range(1, len(word_list) + 1):
            for start in range(0, len(word_list)):
                phrase = word_list[start:(start + length)]
                phrase = np.append(np.array([fits[word_list.index(w)] for w in phrase]),
                                   np.zeros([len(word_list) - len(phrase)]))
                phrases.append(phrase)
        phrases = np.array(phrases)
        self._phrases = phrases

    def active_train(self):
        pass
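
A worked example of Layer.get_output above (a sketch; weights index as [output_unit][input]):

layer = Layer()
layer.set_size(inputs=2, size=2)   # 2x2 zero weights, zero biases
layer.set_weight(0.5, 0, 0)
layer.set_weight(-1.0, 1, 1)
layer.set_bias([0.1, 0.2])
print(layer.get_output([4, 2]))    # [0.5*4 + 0.1, -1.0*2 + 0.2] -> [2.1, -1.8]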

python/src/app.py

@@ -1,22 +1,40 @@
import io
import json
import os
import sys

from flask import Flask, request
import httpx
from flask import Flask, request, jsonify

import contabilidad.pdf as pdf
import contabilidad.passwords as passwords
import contabilidad.log as log
import contabilidad.text_handler as th
from contabilidad.log import Log

app = Flask(__name__)
log.logging['filename'] = '/var/log/python/contabilidad.log'
log = Log('/var/log/python/contabilidad.log')
api_key = os.environ.get('PYTHON_KEY')


def validate_key(request_obj):
    if 'Authorization' in request_obj.headers:
        auth = request_obj.headers.get('Authorization')
        if isinstance(auth, list):
            auth = auth[0]
        if 'Bearer' in auth:
            auth = auth.split(' ')[1]
        return auth == api_key
    if 'API_KEY' in request_obj.values:
        return request_obj.values.get('API_KEY') == api_key
    if 'api_key' in request_obj.values:
        return request_obj.values.get('api_key') == api_key
    return False


@app.route('/pdf/parse', methods=['POST'])
def pdf_parse():
    if not validate_key(request):
        return jsonify({'message': 'Not Authorized'})
    data = request.get_json()
    if not isinstance(data['files'], list):
        data['files'] = [data['files']]

@@ -32,6 +50,11 @@ def pdf_parse():
            continue
        pdf.remove_encryption(filename, p, temp)
        obj = pdf.get_data(temp)
        try:
            text = th.text_cleanup(pdf.get_text(temp))
        except IndexError as ie:
            print(ie, file=sys.stderr)
            continue
        outputs = []
        for o in obj:
            out = json.loads(o.df.to_json(orient='records'))

@@ -48,8 +71,35 @@ def pdf_parse():
                out[i] = line
            outputs.append(out)
        os.remove(temp)
        output.append({'filename': file['filename'], 'text': outputs})
    return json.dumps(output)
        output.append({'bank': text['bank'], 'filename': file['filename'], 'tables': outputs, 'text': text['text']})
    return jsonify(output)


@app.route('/cambio/get', methods=['POST'])
def cambios():
    if not validate_key(request):
        return jsonify({'message': 'Not Authorized'})
    data = request.get_json()
    valid = {
        "CLF": "uf",
        "IVP": "ivp",
        "USD": "dolar",
        "USDo": "dolar_intercambio",
        "EUR": "euro",
        "IPC": "ipc",
        "UTM": "utm",
        "IMACEC": "imacec",
        "TPM": "tpm",
        "CUP": "libra_cobre",
        "TZD": "tasa_desempleo",
        "BTC": "bitcoin"
    }
    base_url = 'https://mindicador.cl/api/'
    url = f"{base_url}{valid[data['desde']]}/{'-'.join(list(reversed(data['fecha'].split('-'))))}"
    res = httpx.get(url)
    if res.status_code != httpx.codes.OK:
        return jsonify({'error': 'Valor no encontrado.'})
    return jsonify(res.json())


if __name__ == '__main__':
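
A sketch of calling the new /cambio/get route with httpx (host, port and key are placeholders; 'fecha' is reversed into the day-month-year form mindicador.cl expects):

import httpx

res = httpx.post('http://localhost:5000/cambio/get',
                 headers={'Authorization': 'Bearer <PYTHON_KEY>'},
                 json={'desde': 'USD', 'fecha': '2021-12-22'})
print(res.json())  # mindicador.cl payload for 'dolar' on 22-12-2021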

python/src/contabilidad/log.py

@@ -1,19 +1,65 @@
import os.path
import time

logging = {
    'filename': '/var/log/python/error.log'
}
import traceback


class LOG_LEVEL:
    INFO = 'INFO'
    WARNING = 'WARNING'
    DEBUG = 'DEBUG'
    ERROR = 'ERROR'
    INFO = 0
    WARNING = 1
    DEBUG = 2
    ERROR = 4

    @staticmethod
    def desc(level):
        mapping = {
            LOG_LEVEL.INFO: 'INFO',
            LOG_LEVEL.WARNING: 'WARNING',
            LOG_LEVEL.DEBUG: 'DEBUG',
            LOG_LEVEL.ERROR: 'ERROR'
        }
        return mapping[level]


def log(message, level=LOG_LEVEL.INFO):
    filename = logging['filename']
    with open(filename, 'a') as f:
        f.write(time.strftime('[%Y-%m-%d %H:%M:%S] ') + ' - ' + level + ': ' + message)


class Logger:
    def __init__(self):
        self._logs = []

    def add_log(self, filename: str, min_level: int = LOG_LEVEL.INFO):
        self._logs.append({'log': Log(filename), 'level': min_level})
        self._logs.sort(key=lambda e: e['level'])
        return self

    def log(self, message, level: int = LOG_LEVEL.INFO):
        for log in self._logs:
            # only write to handlers whose minimum level the message reaches
            if level >= log['level']:
                log['log'].log(message, level)


class Log:
    MAX_SIZE = 10 * 1024 * 1024

    def __init__(self, filename: str = '/var/log/python/error.log'):
        self._filename = filename

    def log(self, message, level: int = LOG_LEVEL.INFO):
        if isinstance(message, Exception):
            message = traceback.format_exc()
            if level < LOG_LEVEL.ERROR:
                level = LOG_LEVEL.ERROR
        self.rotate_file()
        with open(self._filename, 'a') as f:
            f.write(time.strftime('[%Y-%m-%d %H:%M:%S] ') + ' - ' + LOG_LEVEL.desc(level=level) + ': ' + str(message) + "\n")

    def rotate_file(self):
        if not os.path.isfile(self._filename):
            return
        file_size = os.path.getsize(self._filename)
        if file_size > self.MAX_SIZE:
            self.next_file()

    def next_file(self):
        name = self._filename.split('.')
        n = 1
        if name[-2].isnumeric():
            n = int(name[-2]) + 1
        self._filename = '.'.join([name[0], str(n), name[-1]])
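
A usage sketch for the Logger/Log pair above (file names are placeholders):

from contabilidad.log import Logger, LOG_LEVEL

logger = Logger()
logger.add_log('app.log')                     # min_level INFO: receives everything
logger.add_log('error.log', LOG_LEVEL.ERROR)  # receives only ERROR entries
logger.log('parsing started')                 # app.log only
try:
    raise ValueError('bad pdf')
except ValueError as e:
    logger.log(e, LOG_LEVEL.ERROR)            # both files, expanded to a traceback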

python/src/contabilidad/text_handler.py

@@ -1,48 +1,112 @@
def text_cleanup(text, filename: str = None):
def text_cleanup(text: str):
    if isinstance(text, list):
        output = []
        for t in text:
            output.append(text_cleanup(t, filename=filename))
        return output
    if filename is None:
        return text
    if 'bice' in filename.lower():
        return bice(text)
    if 'scotiabank' in filename.lower():
        return scotiabank(text)
    return text
        text = "\n\n\n".join(text)
    if 'bice' in text.lower():
        return {'bank': 'BICE', 'text': bice(text)}
    if 'scotiabank' in text.lower():
        return {'bank': 'Scotiabank', 'text': scotiabank(text)}
    if 'TARJETA' in text:
        return {'bank': 'Scotiabank', 'text': tarjeta(text)}
    return {'bank': 'unknown', 'text': basic(text)}


def bice(text):
    lines = text.split("\n\n\n")
    print(lines)
    return text
    lines = [t2.strip() for t in text.split("\n\n\n")
             for t1 in t.split("\n\n") for t2 in t1.split("\n") if t2.strip() != '']
    output = []
    output += extract_from_to(lines, 'NOMBRE DEL CLIENTE', end='LAS CONDES', line_length=3)
    ti = [t for t in lines if 'MOVIMIENTOS DE LA CUENTA CORRIENTE' in t][0]
    output += extract_from_to(lines, 'LAS CONDES', end=ti, line_length=3)
    output += [ti]
    ti = [i for i, t in enumerate(lines) if 'FECHA' in t]
    output += extract_from_to(lines, ti[0], end=ti[1], line_length=4)
    output += extract_from_to(lines, 'RESUMEN DEL PERIODO', end='SALDO INICIAL', line_length=1)
    output += extract_from_to(lines, 'SALDO INICIAL', end='LINEA SOBREGIRO AUTORIZADA', line_length=4)
    output += extract_from_to(lines, 'LINEA SOBREGIRO AUTORIZADA', end='OBSERVACIONES', line_length=3)
    output += extract_from_to(lines, 'OBSERVACIONES', line_length=1)
    return output


def scotiabank(text):
    words = text.split("\n")
    words = split_words(text)
    output = [words[0]]
    output = output + extract_from_to(words, 'No. CTA.', end='VENCIMIENTO LINEA DE CREDITO', line_length=3)
    output = output + extract_from_to(words, 'VENCIMIENTO LINEA DE CREDITO',
                                      end='NOMBRE EJECUTIVO: LILIAN AVILA MANRIQUEZ', line_length=2)
    output = output + extract_from_to(words, 'NOMBRE EJECUTIVO: LILIAN AVILA MANRIQUEZ', end='SALDO ANTERIOR',
                                      line_length=1)
    output = output + extract_from_to(words, 'SALDO ANTERIOR', end='FECHA', line_length=4)
    output = output + extract_data(words, 'FECHA', end='ACTUALICE SIEMPRE ANTECEDENTES LEGALES, ', line_length=6,
                                   merge_list=[['DOCTO', 'No.'], ['SALDO', 'DIARIO']])
    [print(li) for li in output]
    return text
    output += extract_from_to(words, 'No. CTA.', end='VENCIMIENTO LINEA DE CREDITO', line_length=3)
    output += extract_from_to(words, 'VENCIMIENTO LINEA DE CREDITO',
                              end='NOMBRE EJECUTIVO: LILIAN AVILA MANRIQUEZ', line_length=2)
    output += extract_from_to(words, 'NOMBRE EJECUTIVO: LILIAN AVILA MANRIQUEZ', end='SALDO ANTERIOR',
                              line_length=1)
    output += extract_from_to(words, 'SALDO ANTERIOR', end='FECHA', line_length=4)
    output += extract_data(words, 'FECHA', end='ACTUALICE SIEMPRE ANTECEDENTES LEGALES, ', line_length=6,
                           merge_list=[['DOCTO', 'No.'], ['SALDO', 'DIARIO']])
    output += extract_from_to(words, 'ACTUALICE SIEMPRE ANTECEDENTES LEGALES, ', 1)
    return output


def extract_from_to(word_list, start, line_length, end: str = None, merge_list=None):
def tarjeta(text):
    words = split_words(text)
    output = ['ESTADO DE CUENTA NACIONAL DE TARJETA DE CRÉDITO']
    i = [i for i, w in enumerate(words) if 'FECHA ESTADO DE CUENTA' in w][0] + 2
    output += extract_from_to(words, 'NOMBRE DEL TITULAR', end=i, line_length=2)
    output += ['I. INFORMACIóN GENERAL']
    i = [i for i, w in enumerate(words) if 'CUPO TOTAL' in w][1]
    output += extract_from_to(words, 'CUPO TOTAL', end=i, line_length=3)
    output += extract_from_to(words, i, end='ROTATIVO', line_length=4)
    output += extract_from_to(words, 'ROTATIVO', end='TASA INTERÉS VIGENTE', line_length=3)
    output += extract_from_to(words, 'TASA INTERÉS VIGENTE',
                              end='CAE se calcula sobre un supuesto de gasto mensual de UF 20 y pagadero en 12 cuotas.',
                              line_length=4)
    output += extract_from_to(words, 'DESDE', end='PERÍODO FACTURADO', line_length=2)
    output += extract_from_to(words, 'PERÍODO FACTURADO', end='II.', line_length=3)
    output += ['II. DETALLE']
    output += extract_from_to(words, '1. PERÍODO ANTERIOR', end='SALDO ADEUDADO INICIO PERÍODO ANTERIOR', line_length=3)
    i = words.index('2. PERÍODO ACTUAL')
    output += extract_from_to(words, 'SALDO ADEUDADO INICIO PERÍODO ANTERIOR', end=i - 1, line_length=2,
                              merge_list=[['MONTO FACTURADO A PAGAR (PERÍODO ANTERIOR)', '(A)']], merge_character=" ")
    output += ['2. PERÍODO ACTUAL']
    output += extract_from_to(words, 'LUGAR DE', end='1.TOTAL OPERACIONES', line_length=7,
                              merge_list=[['OPERACIÓN', 'O COBRO'], ['TOTAL A', 'PAGAR'], ['VALOR CUOTA', 'MENSUAL']])
    i = words.index('1.TOTAL OPERACIONES') + 3
    output += extract_from_to(words, '1.TOTAL OPERACIONES', end=i, line_length=3)
    output += extract_from_to(words, i, end='TOTAL PAGOS A LA CUENTA', line_length=7)
    i = words.index('TOTAL PAGOS A LA CUENTA') + 2
    output += extract_from_to(words, 'TOTAL PAGOS A LA CUENTA', end=i, line_length=2)
    output += extract_from_to(words, i, end='TOTAL PAT A LA CUENTA', line_length=8)
    i = words.index('TOTAL PAT A LA CUENTA') + 2
    output += extract_from_to(words, 'TOTAL PAT A LA CUENTA', end=i, line_length=2)
    output += extract_from_to(words, i, end=i + 3, line_length=2,
                              merge_list=[
                                  ['2.PRODUCTOS O SERVICIOS VOLUNTARIAMENTE CONTRATADOS SIN MOVIMIENTOS', '(C)']],
                              merge_character=" ")
    if '3.CARGOS, COMISIONES, IMPUESTOS Y ABONOS' in words:
        i = words.index('3.CARGOS, COMISIONES, IMPUESTOS Y ABONOS') + 3
        output += extract_from_to(words, '3.CARGOS, COMISIONES, IMPUESTOS Y ABONOS', end=i, line_length=3)
    return output


def basic(text):
    return split_words(text)


def split_words(text):
    if isinstance(text, list):
        text = "\n\n\n".join(text)
    words = [t.strip() for t in text.split("\n") if t.strip() != '']
    return words


def extract_from_to(word_list, start, line_length, end=None, merge_list=None, merge_character="\n"):
    if not isinstance(start, int):
        start = word_list.index(start)
    if end is not None:
        return extract_by_line(word_list[word_list.index(start):word_list.index(end)], line_length, merge_list)
    return extract_by_line(word_list[word_list.index(start):], line_length, merge_list)
        if not isinstance(end, int):
            end = word_list.index(end)
        return extract_by_line(word_list[start:end], line_length, merge_list, merge_character)
    return extract_by_line(word_list[start:], line_length, merge_list, merge_character)


def extract_by_line(word_list, line_length, merge_list=None):
def extract_by_line(word_list, line_length, merge_list=None, merge_character="\n"):
    if merge_list is not None:
        word_list = merge_words(word_list, merge_list)
        word_list = merge_words(word_list, merge_list, merge_character)
    output = []
    line = []
    for k, w in enumerate(word_list):

@@ -54,22 +118,39 @@ def extract_by_line(word_list, line_length, merge_list=None):
    return output


def merge_words(word_list, merge_list):
def merge_words(word_list, merge_list, merge_character):
    for m in merge_list:
        i = word_list.index(m[0])
        word_list = word_list[:i] + ["\n".join(m)] + word_list[i+len(m):]
        ixs = find_words(word_list, m)
        if not ixs:
            continue
        # merge back-to-front so earlier indices stay valid after each splice
        for i in reversed(ixs):
            word_list = word_list[:i] + [merge_character.join(m)] + word_list[i + len(m):]
    return word_list


def extract_data(word_list, start, line_length, end=None, merge_list=None, date_sep='/'):
def find_words(word_list, find_list):
    ixs = [i for i, w in enumerate(word_list) if find_list[0] == w]
    output = []
    for i in ixs:
        if i + len(find_list) > len(word_list):
            continue
        mistake = False
        for k, m in enumerate(find_list):
            if m != word_list[i + k]:
                mistake = True
                break
        if mistake:
            continue
        output.append(i)
    return output


def extract_data(word_list, start, line_length, end=None, merge_list=None, merge_character="\n", date_sep='/'):
    word_list = word_list[word_list.index(start):]
    if end is not None:
        word_list = word_list[:word_list.index(end)]
    if merge_list is not None:
        word_list = merge_words(word_list, merge_list)
        word_list = merge_words(word_list, merge_list, merge_character)
    output = []
    line = []
    line_num = 0
    col = 0
    for k, w in enumerate(word_list):
        if col > 0 and col % line_length == 0:

@@ -87,4 +168,5 @@ def extract_data(word_list, start, line_length, end=None, merge_list=None, date_sep='/'):
            continue
        line.append(w)
        col += 1
    output.append(line)
    return output
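
A small sketch of how find_words and merge_words cooperate on split header columns (made-up word list):

words = ['FECHA', 'SALDO', 'DIARIO', 'X', 'SALDO', 'DIARIO']
print(find_words(words, ['SALDO', 'DIARIO']))         # [1, 4]
print(merge_words(words, [['SALDO', 'DIARIO']], ' '))
# ['FECHA', 'SALDO DIARIO', 'X', 'SALDO DIARIO']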


@@ -3,22 +3,51 @@ import os

import contabilidad.pdf as pdf
import contabilidad.text_handler as th
from contabilidad.log import Logger, LOG_LEVEL
import ai.dictionary as dictionary
from ai.network import AI


def parse_settings(args):
    output = {'filename': args.filename}
    if not os.path.isfile(output['filename']):
        output['filename'] = os.path.realpath(os.path.join(os.path.dirname(__file__), '..', 'data', args.filename))
    t = args.filename.split('.')
    output['temp'] = os.path.realpath(os.path.join(os.path.dirname(output['filename']), t[0] + '-temp.pdf'))
    output['dictionary'] = os.path.join(os.path.dirname(output['filename']), 'dictionary.json')
    output['network'] = os.path.join(os.path.dirname(output['filename']), 'network.json')
    output['log_file'] = args.log_file or 'python.log'  # assumed fallback name; os.path.isfile(None) would raise
    if not os.path.isfile(output['log_file']):
        output['log_file'] = os.path.join(os.path.dirname(os.path.dirname(output['filename'])), output['log_file'])
    output['error_log_file'] = os.path.join(os.path.dirname(output['log_file']), 'error.log')
    output['logger'] = Logger()
    output['logger'].add_log(output['log_file']).add_log(output['error_log_file'], LOG_LEVEL.ERROR)
    return output


def main(args):
    filename = os.path.realpath(os.path.join(os.path.dirname(__file__), '..', 'data', args.filename))
    temp = os.path.realpath(os.path.join(os.path.dirname(__file__), '..', 'data', args.temp_filename))
    pdf.remove_encryption(filename, args.password, temp)
    obj = pdf.get_data(temp)
    obj = pdf.get_text(filename, args.password)
    text = th.text_cleanup(obj, filename=str(args.filename))
    os.remove(temp)
    settings = parse_settings(args)
    print('Loading AI')
    network = AI(settings['dictionary'], settings['logger'])
    network.set_filename(settings['network'])
    network.add_source({'filename': settings['filename'], 'password': args.password})
    network.process_sources()
    exit()  # everything below is unreachable in this revision
    print('Loading dictionary.')
    dictio = dictionary.Dictionary(settings['dictionary'], settings['logger'])
    print('Getting possible phrases.')
    dictio.process(settings['filename'], args.password)
    dictio.to_data()
    # print('Saving dictionary.')
    # dictio.save()


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', '--filename', type=str)
    parser.add_argument('-p', '--password', type=str, default='')
    parser.add_argument('-t', '--temp_filename', type=str)
    parser.add_argument('-l', '--log_file', type=str, default=None)
    _args = parser.parse_args()
    main(_args)
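
A plausible invocation, assuming the entry point above is saved as main.py under python/src and the file name and password are placeholders:

python main.py -f statement.pdf -p 0839 -l contabilidad.log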