Python working

This commit is contained in:
2021-11-02 22:12:25 -03:00
parent b0f7c9b2b1
commit 61448a2521
4 changed files with 58 additions and 24 deletions

View File

@ -2,17 +2,14 @@ FROM python
RUN apt-get update -y && apt-get install -y ghostscript python3-tk libgl-dev
RUN pip install flask pyyaml pypdf4 gunicorn camelot-py[cv]
RUN pip install flask pyyaml pypdf4 gunicorn camelot-py[cv] pikepdf
WORKDIR /app
COPY ./src/ /app/src/
#ENTRYPOINT ["/bin/bash"]
EXPOSE 5000
WORKDIR /app/src
CMD ["python", "app.py"]
#CMD ["gunicorn", "-b 0.0.0.0:5000", "app:app"]
CMD ["gunicorn", "-b 0.0.0.0:5000", "app:app"]

View File

@ -1,5 +1,8 @@
import io
import json
import os
import sys
from flask import Flask, request
import contabilidad.pdf as pdf
@ -22,16 +25,30 @@ def pdf_parse():
output = []
for file in data['files']:
filename = os.path.realpath(os.path.join('/app/data', file['filename']))
texts = []
t = file['filename'].split('.')
temp = os.path.realpath(os.path.join('/app/data', t[0] + '-temp.pdf'))
for p in pwds:
obj = pdf.get_data(filename, p)
print(obj)
obj = pdf.get_text(filename, p)
if obj is None:
if not pdf.check_password(filename, p):
continue
text = th.text_cleanup(obj, file['filename'])
texts.append(text)
output.append({'filename': file['filename'], 'text': texts})
pdf.remove_encryption(filename, p, temp)
obj = pdf.get_data(temp)
outputs = []
for o in obj:
out = json.loads(o.df.to_json(orient='records'))
if out[0]['0'] == 'FECHA':
for i, line in enumerate(out):
if 'FECHA' in line['0'] or 'ACTUALICE' in line['0']:
continue
if line['0'] == '':
spl = line['1'].split(' ')
else:
spl = line['0'].split(' ')
line['0'] = ' '.join(spl[:3])
line['1'] = ' '.join(spl[3:])
out[i] = line
outputs.append(out)
os.remove(temp)
output.append({'filename': file['filename'], 'text': outputs})
return json.dumps(output)

View File

@ -1,7 +1,30 @@
import sys
import camelot
import PyPDF4
import pikepdf
def is_encrypted(filename):
    """Return the encryption flag PyPDF4 reports for the PDF at *filename*."""
    with open(filename, 'rb') as handle:
        # The reader must be queried while the underlying file is still open.
        return PyPDF4.PdfFileReader(handle).getIsEncrypted()
def check_password(filename, password):
    """Check whether *password* opens the PDF at *filename*.

    Args:
        filename: Path to the PDF file to probe.
        password: Candidate password; converted with ``str()`` before use.

    Returns:
        ``True`` when the file is not encrypted or the password is accepted,
        ``False`` when the decryption attempt is rejected.
    """
    # Open and parse the file once, reusing the same reader for both the
    # encryption probe and the decryption attempt.
    with open(filename, 'rb') as file:
        reader = PyPDF4.PdfFileReader(file)
        if not reader.getIsEncrypted():
            return True
        # decrypt() yields 0 when the password is rejected. Answer from that
        # status directly rather than re-reading the encryption flag, which
        # describes the file and not the outcome of the attempt.
        return reader.decrypt(str(password)) != 0
def remove_encryption(filename, password, new_name):
    """Open the PDF at *filename* with *password* and save it as *new_name*.

    Args:
        filename: Path to the source PDF.
        password: Password used to open the source; converted with ``str()``.
        new_name: Destination path. ``'.pdf'`` is appended when the string
            does not already contain ``'.pdf'``.

    NOTE: the adjusted destination name is not returned, so a caller that
    passes a name without ``'.pdf'`` has to apply the same rule to locate
    the written file.
    """
    if '.pdf' not in new_name:
        new_name += '.pdf'
    # Use the context manager so the pikepdf handle (and the source file it
    # keeps open) is released even if save() raises.
    with pikepdf.open(filename, password=str(password)) as pdf:
        pdf.save(new_name)
def get_pdf(file, password=''):
@ -24,10 +47,5 @@ def get_text(filename, password=''):
return texts
def get_data(filename, password=''):
with open(filename, 'rb') as file:
reader = PyPDF4.PdfFileReader(file)
print(reader, file=sys.stderr)
if password != '' and reader.getIsEncrypted():
return camelot.read_pdf(filename, password=str(password), pages='all', output_format='json')
def get_data(filename):
    """Run camelot over every page of the PDF at *filename*.

    Returns whatever ``camelot.read_pdf`` produces for the file.
    """
    tables = camelot.read_pdf(filename, pages='all', output_format='json')
    return tables

View File

@ -7,16 +7,18 @@ import contabilidad.text_handler as th
def main(args):
filename = os.path.realpath(os.path.join(os.path.dirname(__file__), '..', 'data', args.filename))
obj = pdf.get_data(filename, args.password)
print(obj)
temp = os.path.realpath(os.path.join(os.path.dirname(__file__), '..', 'data', args.temp_filename))
pdf.remove_encryption(filename, args.password, temp)
obj = pdf.get_data(temp)
obj = pdf.get_text(filename, args.password)
text = th.text_cleanup(obj, filename=str(args.filename))
print(text)
os.remove(temp)
if __name__ == '__main__':
    # Command-line entry point: collect the source file name, its password
    # and the temporary file name, then hand the parsed namespace to main().
    cli = argparse.ArgumentParser()
    cli.add_argument('-f', '--filename', type=str)
    cli.add_argument('-p', '--password', type=str, default='')
    cli.add_argument('-t', '--temp_filename', type=str)
    main(cli.parse_args())