pylokid/pylokid/library/pdftotext.py

142 lines
4.6 KiB
Python

#!/usr/bin/env python3
""" extracts data from ELZ PDFs using Poppler pdftotext """
import subprocess
import logging
class PDFParsing:
    """Extract text fields from PDFs by cropping regions with Poppler pdftotext.

    Every field is described by a bounding box (``xMin``/``yMin``/``xMax``/
    ``yMax``) on the first page of the PDF.  For each box the external
    ``pdftotext`` binary is executed once and its standard output becomes the
    field value.
    """

    # Absolute path of the binary that is executed once per field.
    PDFTOTEXT = '/usr/bin/pdftotext'

    # Crop regions of the "Einsatzausdruck" document.
    # Get them using 'pdftotext -bbox'
    # y = row
    # x = column: xMax 450 / 590 means full width
    EINSATZAUSDRUCK_FIELDS = {
        'auftrag': {
            'xMin': 70, 'yMin': 47, 'xMax': 120, 'yMax': 58,
        },
        'datum': {
            'xMin': 190, 'yMin': 47, 'xMax': 239, 'yMax': 58,
        },
        'zeit': {
            'xMin': 190, 'yMin': 59, 'xMax': 215, 'yMax': 70,
        },
        'melder': {
            'xMin': 304, 'yMin': 47, 'xMax': 446, 'yMax': 70, 'edit': 'title'
        },
        'erfasser': {
            'xMin': 448, 'yMin': 59, 'xMax': 478, 'yMax': 70,
        },
        # big field until "Disponierte Einheiten"
        'bemerkungen': {
            'xMin': 28, 'yMin': 112, 'xMax': 590, 'yMax': 350,
        },
        'disponierteeinheiten': {
            'xMin': 28, 'yMin': 366, 'xMax': 450, 'yMax': 376,
        },
        'einsatz': {
            'xMin': 76, 'yMin': 690, 'xMax': 450, 'yMax': 703,
        },
        'sondersignal': {
            'xMin': 76, 'yMin': 707, 'xMax': 450, 'yMax': 721,
        },
        'ort': {
            'xMin': 76, 'yMin': 732, 'xMax': 590, 'yMax': 745,
        },
        'hinweis': {
            'xMin': 76, 'yMin': 773, 'xMax': 450, 'yMax': 787,
        },
    }

    # Crop regions of the "Einsatzprotokoll" document.
    # Get them using 'pdftotext -bbox'
    # y = row
    # x = column: xMax 450 / 590 means full width
    EINSATZPROTOKOLL_FIELDS = {
        'auftrag': {
            'xMin': 192, 'yMin': 132, 'xMax': 238, 'yMax': 142,
        },
        'angelegt': {
            'xMin': 192, 'yMin': 294, 'xMax': 226, 'yMax': 304,
        },
        'dispo': {
            'xMin': 192, 'yMin': 312, 'xMax': 226, 'yMax': 322,
        },
        'ausgerueckt': {
            'xMin': 192, 'yMin': 331, 'xMax': 226, 'yMax': 341,
        },
        'vorort': {
            'xMin': 192, 'yMin': 348, 'xMax': 226, 'yMax': 358,
        },
    }

    def __init__(self):
        self.logger = logging.getLogger(__name__)
        self.logger.info('PDF parsing based on pdftotext loaded')

    def _crop_text(self, f_id, file, field, region):
        """Run pdftotext on one crop region of page 1 and return its text.

        Args:
            f_id: identifier used only as a prefix in log messages.
            file: path of the PDF handed to pdftotext.
            field: name of the field being extracted (for logging).
            region: mapping with ``xMin``, ``yMin``, ``xMax`` and ``yMax``.

        Returns:
            Standard output of pdftotext with trailing whitespace removed.

        Raises:
            OSError: if the pdftotext binary cannot be executed.
        """
        # x-coordinate of the crop area top left corner
        x = region['xMin']
        # y-coordinate of the crop area top left corner
        y = region['yMin']
        # width of crop area in pixels
        w = region['xMax'] - region['xMin']
        # height of crop area in pixels
        h = region['yMax'] - region['yMin']

        self.logger.debug(
            '[%s] Computed command for field %s: '
            'pdftotext -f 1 -l 1 -x %s -y %s -W %s -H %s',
            f_id, field, x, y, w, h,
        )

        # stderr is captured separately so that warnings or error messages of
        # pdftotext never end up inside the extracted field value.
        result = subprocess.run(
            [
                self.PDFTOTEXT,
                '-f', '1',
                '-l', '1',
                '-x', str(x),
                '-y', str(y),
                '-W', str(w),
                '-H', str(h),
                file,
                '-',
            ],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            check=False,
        )

        diagnostics = result.stderr.strip()
        if result.returncode != 0:
            self.logger.error(
                '[%s] pdftotext failed for field %s (exit code %s): %s',
                f_id, field, result.returncode, diagnostics,
            )
        elif diagnostics:
            self.logger.warning(
                '[%s] pdftotext reported for field %s: %s',
                f_id, field, diagnostics,
            )

        return result.stdout.rstrip()

    def extract(self, f_id, file, datafields):
        """Extract all ``datafields`` from ``file`` and verify the order ID.

        Args:
            f_id: expected ID; it must equal the extracted ``auftrag`` field.
            file: path of the PDF to parse.
            datafields: mapping of field name to crop region.  A region may
                carry ``'edit': 'title'`` to title-case the extracted text.

        Returns:
            A dict mapping every field name to its extracted text when the
            extracted ``auftrag`` equals ``f_id``; ``False`` otherwise,
            including when no ``auftrag`` field was extracted at all.

        Raises:
            OSError: if the pdftotext binary cannot be executed.
        """
        self.logger.info('[%s] parsing PDF file %s', f_id, file)

        data = {}
        for field, coordinate in datafields.items():
            text = self._crop_text(f_id, file, field, coordinate)
            ## TODO: fixup some fields (lowercase, remove unnecessary \n)
            if coordinate.get('edit') == 'title':
                text = text.title()
            data[field] = text

        # sanity check to see if we can correlate the f_id; a missing
        # 'auftrag' field counts as a mismatch instead of raising KeyError
        auftrag = data.get('auftrag')
        if auftrag is not None and f_id == auftrag:
            self.logger.debug('[%s] ID matches in PDF', f_id)
            return data

        self.logger.error('[%s] ID does not match in PDF: "%s"', f_id, auftrag)
        return False

    def extract_einsatzausdruck(self, file, f_id):
        """ extracts information from Einsatzausdruck using external pdftotext """
        self.logger.debug('[%s] Parsing PDF: %s', f_id, file)
        return self.extract(f_id, file, self.EINSATZAUSDRUCK_FIELDS)

    def extract_einsatzprotokoll(self, file, f_id):
        """ extracts information from Einsatzprotokoll using external pdftotext """
        self.logger.debug('[%s] Parsing PDF: %s', f_id, file)
        return self.extract(f_id, file, self.EINSATZPROTOKOLL_FIELDS)