Python working
This commit is contained in:
@ -2,17 +2,14 @@ FROM python
|
||||
|
||||
RUN apt-get update -y && apt-get install -y ghostscript python3-tk libgl-dev
|
||||
|
||||
RUN pip install flask pyyaml pypdf4 gunicorn camelot-py[cv]
|
||||
RUN pip install flask pyyaml pypdf4 gunicorn camelot-py[cv] pikepdf
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY ./src/ /app/src/
|
||||
|
||||
#ENTRYPOINT ["/bin/bash"]
|
||||
|
||||
EXPOSE 5000
|
||||
|
||||
WORKDIR /app/src
|
||||
|
||||
CMD ["python", "app.py"]
|
||||
#CMD ["gunicorn", "-b 0.0.0.0:5000", "app:app"]
|
||||
CMD ["gunicorn", "-b 0.0.0.0:5000", "app:app"]
|
||||
|
@ -1,5 +1,8 @@
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
from flask import Flask, request
|
||||
|
||||
import contabilidad.pdf as pdf
|
||||
@ -22,16 +25,30 @@ def pdf_parse():
|
||||
output = []
|
||||
for file in data['files']:
|
||||
filename = os.path.realpath(os.path.join('/app/data', file['filename']))
|
||||
texts = []
|
||||
t = file['filename'].split('.')
|
||||
temp = os.path.realpath(os.path.join('/app/data', t[0] + '-temp.pdf'))
|
||||
for p in pwds:
|
||||
obj = pdf.get_data(filename, p)
|
||||
print(obj)
|
||||
obj = pdf.get_text(filename, p)
|
||||
if obj is None:
|
||||
if not pdf.check_password(filename, p):
|
||||
continue
|
||||
text = th.text_cleanup(obj, file['filename'])
|
||||
texts.append(text)
|
||||
output.append({'filename': file['filename'], 'text': texts})
|
||||
pdf.remove_encryption(filename, p, temp)
|
||||
obj = pdf.get_data(temp)
|
||||
outputs = []
|
||||
for o in obj:
|
||||
out = json.loads(o.df.to_json(orient='records'))
|
||||
if out[0]['0'] == 'FECHA':
|
||||
for i, line in enumerate(out):
|
||||
if 'FECHA' in line['0'] or 'ACTUALICE' in line['0']:
|
||||
continue
|
||||
if line['0'] == '':
|
||||
spl = line['1'].split(' ')
|
||||
else:
|
||||
spl = line['0'].split(' ')
|
||||
line['0'] = ' '.join(spl[:3])
|
||||
line['1'] = ' '.join(spl[3:])
|
||||
out[i] = line
|
||||
outputs.append(out)
|
||||
os.remove(temp)
|
||||
output.append({'filename': file['filename'], 'text': outputs})
|
||||
return json.dumps(output)
|
||||
|
||||
|
||||
|
@ -1,7 +1,30 @@
|
||||
import sys
|
||||
|
||||
import camelot
|
||||
import PyPDF4
|
||||
import pikepdf
|
||||
|
||||
|
||||
def is_encrypted(filename):
    """Report whether the PDF at *filename* is flagged as encrypted.

    The file is opened in binary mode and handed to
    ``PyPDF4.PdfFileReader``; the reader's ``getIsEncrypted()`` result is
    returned unchanged.
    """
    with open(filename, 'rb') as stream:
        return PyPDF4.PdfFileReader(stream).getIsEncrypted()
|
||||
|
||||
|
||||
def check_password(filename, password):
    """Return True when *password* can open the PDF at *filename*.

    A PDF that is not encrypted is reported as True for any password.
    For an encrypted PDF the password is converted with ``str()`` and
    passed to ``PdfFileReader.decrypt``; a result of 0 means it was
    rejected.

    The result is derived from the ``decrypt`` status itself rather than
    from ``getIsEncrypted()``: that flag describes the document, not the
    outcome of the password attempt. The file is also opened and parsed
    once instead of twice.
    """
    with open(filename, 'rb') as file:
        reader = PyPDF4.PdfFileReader(file)
        if not reader.getIsEncrypted():
            return True
        # decrypt() yields 0 on a failed match, non-zero on success.
        return reader.decrypt(str(password)) != 0
|
||||
|
||||
|
||||
def remove_encryption(filename, password, new_name):
    """Save a copy of the PDF at *filename* under *new_name*.

    The source is opened with pikepdf using ``str(password)`` and written
    back out with ``save()`` and no encryption arguments.
    NOTE(review): per the pikepdf docs, saving without an ``encryption``
    argument drops the existing encryption -- confirm for the pinned
    pikepdf version.

    ``.pdf`` is appended to *new_name* when it does not already end with
    that suffix. A suffix test is used instead of a substring test so a
    ``.pdf`` occurring elsewhere in the path (e.g. in a directory name)
    does not suppress the extension.

    Returns:
        The path the copy was actually written to, so callers can clean
        up the right file even when the suffix was appended.
    """
    if not new_name.endswith('.pdf'):
        new_name += '.pdf'
    # The context manager closes the pikepdf handle even if save() raises.
    with pikepdf.open(filename, password=str(password)) as pdf:
        pdf.save(new_name)
    return new_name
|
||||
|
||||
|
||||
def get_pdf(file, password=''):
|
||||
@ -24,10 +47,5 @@ def get_text(filename, password=''):
|
||||
return texts
|
||||
|
||||
|
||||
def get_data(filename, password=''):
    """Read the tables on every page of the PDF at *filename* with camelot.

    A single definition serves both call styles: ``get_data(path)`` and
    ``get_data(path, password)``. The password is forwarded to camelot
    only when it is non-empty and PyPDF4 reports the file as encrypted;
    otherwise the file is read without one.

    Returns:
        Whatever ``camelot.read_pdf`` returns for ``pages='all'``.
    """
    options = {'pages': 'all', 'output_format': 'json'}
    if password != '':
        with open(filename, 'rb') as file:
            encrypted = PyPDF4.PdfFileReader(file).getIsEncrypted()
        if encrypted:
            options['password'] = str(password)
    return camelot.read_pdf(filename, **options)
|
||||
|
@ -7,16 +7,18 @@ import contabilidad.text_handler as th
|
||||
|
||||
def main(args):
    """Print the tables and the cleaned-up text of one PDF from ``../data``.

    Args:
        args: parsed command-line namespace providing ``filename``,
            ``password`` and ``temp_filename``; both file names are
            resolved relative to the ``data`` directory next to this
            file's parent directory.

    The PDF is first copied to ``temp_filename`` through
    ``pdf.remove_encryption`` and the tables are read from that copy, so
    ``pdf.get_data`` is only ever called with a single path argument.
    The text is still read from the original file with the password.
    The temporary copy is removed even when a step fails.
    """
    data_dir = os.path.join(os.path.dirname(__file__), '..', 'data')
    filename = os.path.realpath(os.path.join(data_dir, args.filename))
    temp = os.path.realpath(os.path.join(data_dir, args.temp_filename))
    try:
        pdf.remove_encryption(filename, args.password, temp)
        print(pdf.get_data(temp))
        obj = pdf.get_text(filename, args.password)
        text = th.text_cleanup(obj, filename=str(args.filename))
        print(text)
    finally:
        # The copy may not exist if remove_encryption failed; do not let
        # the cleanup mask the original error.
        if os.path.exists(temp):
            os.remove(temp)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Command-line entry point. Both file names are joined into a path by
    # main(), so they are required here: a missing value would otherwise
    # surface as a TypeError from os.path.join instead of a usage message.
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', '--filename', type=str, required=True)
    parser.add_argument('-p', '--password', type=str, default='')
    parser.add_argument('-t', '--temp_filename', type=str, required=True)
    _args = parser.parse_args()
    main(_args)
|
||||
|
Reference in New Issue
Block a user