Files
contabilidad/python/src/ai/dictionary.py
2021-12-06 22:13:06 -03:00

286 lines
9.7 KiB
Python

import json
import os
import numpy as np
import sklearn
import enlighten
from sklearn.preprocessing import LabelEncoder
import src.contabilidad.pdf as pdf
import src.contabilidad.text_handler as th
from src.ai.models import Phrase, phrase_factory, Word, word_factory
from src.contabilidad.log import LOG_LEVEL
class Dictionary:
    """Persistent store of words, phrases and already-processed file names.

    The state lives in three private collections and is mirrored to a JSON
    file (``filename``) with the top-level keys ``processed``, ``words`` and
    ``phrases``.
    """

    def __init__(self, filename, logger):
        """Remember the backing file and the logger, then load saved state.

        :param filename: path of the JSON file used by ``load`` and ``save``.
        :param logger: object exposing ``log(message[, level])``; used to
            report sorting failures.
        """
        self.filename = filename
        self._logger = logger
        self.__processed = []
        self.__phrases = None
        self.__words = None
        self.load()

    def load(self):
        """Read words, phrases and processed names from ``self.filename``.

        A missing file is not an error; the dictionary simply keeps its
        current state. Only the keys present in the file are overwritten.
        """
        if not os.path.isfile(self.filename):
            return
        with open(self.filename, 'r', encoding='utf-8') as file:
            data = json.load(file)
        if 'words' in data:
            self.__words = [word_factory(w) for w in data['words']]
        if 'phrases' in data:
            self.__phrases = [phrase_factory(ph) for ph in data['phrases']]
        if 'processed' in data:
            self.__processed = data['processed']

    def save(self):
        """Sort words and phrases, then write the dictionary to ``self.filename``."""
        self.sort_words()
        self.sort_phrases()
        # Serialise before opening the file: opening with 'w' truncates it, so
        # a failure inside to_json() must not leave an empty file behind.
        payload = self.to_json()
        with open(self.filename, 'w', encoding='utf-8') as file:
            json.dump(payload, file, indent=2)

    def to_data(self):
        """Label-encode the word strings and store each code on its word.

        The codes come from ``LabelEncoder.fit_transform`` and are handed to
        ``set_fit`` of the very same objects that were encoded. The encoded
        array is printed; nothing is returned.
        """
        words = self.get_words()
        encoder = LabelEncoder()
        data = encoder.fit_transform([w.get_word() for w in words])
        # Pair every code with the word it was computed from instead of
        # indexing the raw (possibly longer) internal list.
        for word, fit in zip(words, data):
            word.set_fit(fit)
        print(data)
        # return [ph.to_data() for ph in self.get_phrases()]

    def to_json(self):
        """Return the dictionary as a JSON-serialisable ``dict``.

        :return: mapping with the keys ``processed``, ``words`` and
            ``phrases``; each value is a (possibly empty) list.
        """
        return {
            'processed': list(self.__processed or []),
            'words': [w.to_json() for w in self.__words or []],
            'phrases': [p.to_json() for p in self.__phrases or []],
        }

    def find_phrase(self, phrase: 'Phrase' = None, phrase_dict: dict = None, phrase_list: list = None):
        """Return the index of a stored phrase, or ``-1`` when not found.

        Exactly one way of describing the phrase is used, in this order of
        precedence: ``phrase`` (object with ``get_words()``), ``phrase_dict``
        (mapping with a ``'words'`` entry), ``phrase_list`` (word strings).
        """
        if not self.__phrases:
            return -1
        if phrase is not None:
            phrase_list = [w.get_word() for w in phrase.get_words()]
        elif phrase_dict is not None:
            phrase_list = phrase_dict['words']
        elif phrase_list is None:
            return -1
        return find_phrase(self.__phrases, phrase_list)

    def add_phrase(self, phrase: 'Phrase' = None, phrase_dict: dict = None, phrase_list: list = None):
        """Add one phrase, or bump its frequency when it is already stored.

        The phrase may be given as an object, as a dict understood by
        ``phrase_factory`` or as a list of words. With no argument the call
        is a no-op.

        :return: ``self`` so calls can be chained.
        """
        if self.__phrases is None:
            self.__phrases = []
        if phrase is None:
            if phrase_dict is not None:
                phrase = phrase_factory(phrase_dict)
            elif phrase_list is not None:
                phrase = phrase_factory({'words': phrase_list})
            else:
                return self
        i = self.find_phrase(phrase)
        if i > -1:
            self.__phrases[i].add_freq()
            return self
        self.__phrases.append(phrase)
        return self

    def add_phrases(self, phrase_list: list):
        """Add many phrases at once, counting repeats as frequency bumps.

        :param phrase_list: iterable of phrases, each an iterable of objects
            exposing ``get_word()``. Two phrases are considered the same when
            they contain the same words regardless of order.
        """
        if self.__phrases is None:
            self.__phrases = []
        # Map "sorted words" -> position of the first phrase with those words,
        # so every lookup is a dict access instead of a scan over all phrases.
        positions = {}
        for pos, stored in enumerate(self.__phrases):
            key = tuple(sorted(w.get_word() for w in stored.get_words()))
            positions.setdefault(key, pos)
        with enlighten.get_manager() as manager:
            with manager.counter(total=len(phrase_list), desc='Phrases', unit='phrases', color='green') as bar1:
                for phrase in phrase_list:
                    key = tuple(sorted(w.get_word() for w in phrase))
                    pos = positions.get(key)
                    if pos is None:
                        positions[key] = len(self.__phrases)
                        self.__phrases.append(phrase_factory({'words': phrase}))
                    else:
                        self.__phrases[pos].add_freq()
                    # Advance for every input phrase, duplicates included,
                    # so the bar actually reaches its total.
                    bar1.update()

    def get_phrases(self):
        """Return the internal phrase list (``None`` until phrases exist)."""
        return self.__phrases

    def sort_phrases(self):
        """Sort phrases ascending by (frequency, type description, word count).

        Sorting is best effort: a failure is logged and the current order is
        kept.

        :return: ``self`` so calls can be chained.
        """
        if self.__phrases is None:
            return self

        def sort_phrase(p):
            if p is None:
                # Same shape as the real keys so the entries stay comparable.
                return 0, '', 0
            if isinstance(p, Phrase):
                return p.get_freq(), p.get_type().get_desc(), len(p.get_words())
            return p['frequency'], p['type']['description'], len(p['words'])

        try:
            self.__phrases = sorted(self.__phrases, key=sort_phrase)
        except Exception as e:
            self._logger.log(repr(self.__phrases), LOG_LEVEL.ERROR)
            self._logger.log(e)
        return self

    def sort_words(self):
        """Sort words descending by (frequency, type description, word).

        Sorting is best effort: a failure is logged and the current order is
        kept.

        :return: ``self`` so calls can be chained.
        """
        if self.__words is None:
            return self

        def sort_word(w):
            if w is None:
                # Same shape as the real keys so the entries stay comparable.
                return 0, '', ''
            if isinstance(w, Word):
                return w.get_freq(), w.get_type().get_desc(), w.get_word()
            return w['frequency'], w['type']['description'], w['word']

        try:
            self.__words = sorted(self.__words, key=sort_word, reverse=True)
        except Exception as e:
            self._logger.log(repr(self.__words), LOG_LEVEL.ERROR)
            self._logger.log(e)
        return self

    def find_word(self, word: 'Word' = None, word_dict: dict = None, word_str: str = None):
        """Return the index of a stored word, or ``-1`` when not found.

        The word may be given as an object with ``get_word()``, as a dict
        with a ``'word'`` entry or as a plain string (checked in that order).
        """
        if not self.__words:
            return -1
        if word is not None:
            word_str = word.get_word()
        elif word_dict is not None:
            word_str = word_dict['word']
        elif word_str is None:
            return -1
        return find_word(self.__words, word_str)

    def add_word(self, word: 'Word' = None, word_dict: dict = None, word_str: str = None):
        """Add one word, or bump its frequency when it is already stored.

        The word may be given as an object, as a dict understood by
        ``word_factory`` or as a plain string. With no argument the call is a
        no-op.

        :return: ``self`` so calls can be chained.
        """
        if self.__words is None:
            self.__words = []
        if word is None:
            if word_dict is not None:
                word = word_factory(word_dict)
            elif word_str is not None:
                word = word_factory({'word': word_str})
            else:
                return self
        i = self.find_word(word)
        if i > -1:
            self.__words[i].add_freq()
            return self
        self.__words.append(word)
        return self

    def add_words(self, words: list):
        """Add every ``Word``, dict and string found in *words*.

        Items are added grouped by kind (all ``Word`` objects first, then the
        dicts, then the strings); items of any other type are ignored.

        :return: ``self`` so calls can be chained.
        """
        for w in words:
            if isinstance(w, Word):
                self.add_word(word=w)
        for w in words:
            if isinstance(w, dict):
                self.add_word(word_dict=w)
        for w in words:
            if isinstance(w, str):
                self.add_word(word_str=w)
        return self

    def get_words(self):
        """Return the stored words without duplicates (empty list if none)."""
        if not self.__words:
            return []
        return filter_unique_words(self.__words)

    def match_words(self, word_list: list):
        """Translate word strings into the stored word objects.

        :param word_list: iterable of word strings.
        :return: list with the stored object for every string, in input
            order. When a string is stored more than once the first
            occurrence is used.
        :raises KeyError: if a string is not part of the dictionary. A failed
            lookup must not fall through to index ``-1``, which would
            silently hand back the last stored word.
        """
        by_text = {}
        for stored in self.__words or []:
            by_text.setdefault(stored.get_word(), stored)
        matched = []
        for text in word_list:
            if text not in by_text:
                raise KeyError(f'Word not in dictionary: {text!r}')
            matched.append(by_text[text])
        return matched

    def append_to_phrase(self, seed: list = None, length: int = 1):
        """Extend a seed with the word found ``length`` places after it.

        NOTE(review): this helper looks unfinished - ``seed`` is read as a
        list of word indexes while the result carries a word object. The
        changes here only remove the guaranteed failures; confirm the
        intended return shape with the author.

        :param seed: list of indexes into the word list, or ``None`` to start
            a phrase with the first word.
        :param length: distance from ``max(seed)`` to the word to append; it
            is shortened step by step while it points past the last word.
        :return: new list (``seed`` plus the selected word), or ``False``
            when there is no word to append.
        """
        if not self.__words:
            return False
        if seed is None:
            return [self.__words[0]]
        max_index = max(seed) + length
        # Valid indexes end at len - 1, so len itself is already out of range.
        if max_index >= len(self.__words):
            if length <= 1:
                return False
            return self.append_to_phrase(seed, length - 1)
        # Wrap the word in a list: list + object raises TypeError.
        return seed + [self.__words[max_index]]

    def get_possible_phrases(self, word_list):
        """Register every contiguous run of words in *word_list* as a phrase.

        The words are added to the dictionary first. Then each window
        ``word_list[start:start + length]`` that fits completely inside the
        list is stored once via ``add_phrases``. Windows that would run past
        the end are not generated; they would only repeat shorter phrases and
        inflate their frequency.

        :param word_list: list of word strings in document order.
        :return: the internal phrase list after the update.
        """
        print('Adding words.')
        self.add_words(word_list)
        print('Creating phrases.')
        # Resolve every word once; each window is then a plain slice instead
        # of a fresh vocabulary lookup per window.
        matched = self.match_words(word_list)
        total = len(matched)
        phrases = []
        with enlighten.get_manager() as manager:
            with manager.counter(total=total * (total + 1) // 2, desc='Phrases', unit='words', color='red') as bar1:
                for length in range(1, total + 1):
                    bar2 = bar1.add_subcounter(color='green')
                    for start in range(total - length + 1):
                        phrases.append(build_phrase(matched, start, start + length))
                        # Updating the subcounter also advances its parent.
                        bar2.update()
        print(f'Created {len(phrases)} phrases.')
        # Windows are produced in order of increasing length, so the list is
        # already sorted by phrase size.
        print('Adding phrases.')
        self.add_phrases(phrases)
        return self.__phrases

    def is_processed(self, filename: str):
        """Tell whether the file (compared by base name) was processed before."""
        return os.path.basename(filename) in self.__processed

    def process(self, filename: str, password: str = None):
        """Extract the text of a PDF and register all its possible phrases.

        The file is passed through ``pdf.remove_encryption`` into a temporary
        ``<name>-temp.pdf`` next to the original, its text is read with
        ``pdf.get_text`` and the temporary file is removed again, even when
        one of those steps fails.

        :param filename: path of the PDF to process.
        :param password: password handed to ``pdf.remove_encryption``.
        :return: the phrase list after the update, or ``None`` when the file
            had been processed already.
        """
        if self.is_processed(filename):
            print('Already processed.')
            return
        # splitext only strips the final extension, so dots in directory
        # names and relative paths keep the temp file next to the source.
        root, _ = os.path.splitext(filename)
        temp = os.path.realpath(root + '-temp.pdf')
        try:
            print('Removing PDF encryption.')
            pdf.remove_encryption(filename, password, temp)
            print('Getting text')
            obj = pdf.get_text(temp)
        finally:
            # Never leave the decrypted copy on disk.
            if os.path.exists(temp):
                os.remove(temp)
        print('Getting possible phrases.')
        phrases = self.get_possible_phrases(th.split_words(obj))
        self.__processed.append(os.path.basename(filename))
        return phrases
def build_phrase(word_list, start: int, end: int = None):
    """Return the slice of *word_list* from *start* up to (not including) *end*.

    :param word_list: sequence to slice.
    :param start: index of the first item to keep.
    :param end: index one past the last item to keep; ``None`` keeps
        everything up to the end of the sequence.
    :return: the selected sub-sequence.
    """
    # A ``None`` upper bound already means "to the end" in slice notation,
    # so a single slice expression covers both cases.
    return word_list[start:end]
def filter_unique_words(words):
    """Return the items of *words* without duplicates, keeping first-seen order.

    Items are compared with ``==``, so they do not need to be hashable.

    :param words: iterable of items, or ``None``.
    :return: new list with each distinct item once; empty for ``None``.
    """
    # A dictionary without any words holds ``None`` rather than an empty list.
    if words is None:
        return []
    unique = []
    for candidate in words:
        if candidate not in unique:
            unique.append(candidate)
    return unique
def validate_phrase(phrase):
    """Placeholder validator: every phrase is accepted.

    :param phrase: ignored.
    :return: always ``True``.
    """
    return True
def find_phrase(phrases: list, phrase: list):
    """Return the index of the first phrase made of the same words as *phrase*.

    Word order is ignored: two phrases match when their sorted word strings
    are equal.

    :param phrases: stored phrases; each exposes ``get_words()`` returning
        objects with ``get_word()``.
    :param phrase: word strings to look for.
    :return: position of the first match in *phrases*, or ``-1``.
    """
    target = sorted(phrase)
    for index, candidate in enumerate(phrases):
        words = candidate.get_words()
        # Phrases of a different size can never match; skip the sort.
        if len(words) != len(target):
            continue
        if sorted(w.get_word() for w in words) == target:
            return index
    return -1
def find_word(words: list, word: str):
    """Return the index of the first entry of *words* whose text is *word*.

    :param words: stored words; each exposes ``get_word()``.
    :param word: text to look for.
    :return: position of the first match in *words*, or ``-1``.
    """
    # Single pass with early exit instead of building the full text list and
    # scanning it twice (membership test, then ``index``).
    for index, candidate in enumerate(words):
        if candidate.get_word() == word:
            return index
    return -1