diff --git a/xpdf_python/check_xpdf.py b/xpdf_python/check_xpdf.py index ce9629f..ad21254 100644 --- a/xpdf_python/check_xpdf.py +++ b/xpdf_python/check_xpdf.py @@ -1,7 +1,6 @@ import os import sys -if os.path.isfile('/usr/local/bin/pdftotext'): - pass -else: - sys.exit("Did not detect correctly installed xpdf. Please follow install instructions at: https://github.com/ecatkins/xpdf_python.") \ No newline at end of file +# should cover windows, linux, or mac distros using either the pdftotext distro or the XPDF tools distro +if not any(i in os.environ['PATH'] for i in ('pdftotext', 'XPDF')): + sys.exit("Did not detect correctly installed xpdf. Please follow install instructions at: https://github.com/ecatkins/xpdf_python.") diff --git a/xpdf_python/wrapper.py b/xpdf_python/wrapper.py index 792e73a..cf7baf8 100644 --- a/xpdf_python/wrapper.py +++ b/xpdf_python/wrapper.py @@ -13,13 +13,14 @@ def countPages(filename): data = open(filename,"r", encoding = "ISO-8859-1").read() return len(rxcountpages.findall(data)) -def to_text(file_loc, page_nums = True): +def to_text(file_loc, page_nums=True, options=()): ''' Converts PDF to text Args - - - - - - - file_loc: path to pdf document, string page_nums: whether to insert page numbers into document, boolean + options: allows the addition of any of the normal options accepted by pdftotext, tuple of strings Returns - - - - - - - @@ -35,35 +36,45 @@ def to_text(file_loc, page_nums = True): cd = os.getcwd() full_file_loc = os.path.join(cd, file_loc) + path, file = os.path.split(full_file_loc) + saved_file = os.path.join(path, os.path.splitext(file)[0] + '.txt') + text = '' actual_count = 0 # If page numbers are to be inserted if page_nums: - # Count number of pages num = countPages(full_file_loc) # Accounts for errors occuring in countPages function if num == 0: num = 100 - for i in range(num): - actual = i + 1 - # Calls xpdf - subprocess.call(['pdftotext', '-f', str(actual),'-l', str(actual), full_file_loc]) - # Opens file saved 
to disk - saved_file = full_file_loc.replace('.pdf','.txt') - file = open(saved_file,'r', encoding = "ISO-8859-1") + else: + # without page numbers, the loop runs exactly once as a single bulk conversion + num = 1 + actual_count = countPages(full_file_loc) # try to provide a page estimate on bulk runs + + for i in range(num): + actual = i + 1 + opt = options + if page_nums: + opt += ('-f', str(actual), '-l', str(actual)) + + # Calls xpdf + subprocess.call(['pdftotext', *opt, full_file_loc]) + + # Opens file saved to disk, ensures it will always close when done + with open(saved_file, 'r', encoding='ISO-8859-1') as file: t = file.read() - # If the page is blank, it is not a real page - if t == '': - continue - else: - actual_count += 1 - # Add text and page count to existing string + # If the page is blank, it is not a real page + if t == '': + continue + if page_nums: + actual_count += 1 + # Add text and page count to existing string, or keep just the raw text if not page_nums + if page_nums: text += '***Page {}*** {}'.format(actual, t) - file.close() - else: - # TO BE IMPLEMENTED - pass + else: + text = t # Remove file saved to disk os.remove(saved_file)