From 92aeb817c4501f4787825fa861cbe2f264b290bc Mon Sep 17 00:00:00 2001 From: donknight <31926144+donknight@users.noreply.github.com> Date: Thu, 14 Sep 2017 09:50:05 -0400 Subject: [PATCH 1/4] Make pdftotext check flexible Change pdftotext check from single path to environment PATH variable to better support non-default installs. Add XPDF to path check to support Windows install of the XPDF port of pdftotext. --- xpdf_python/check_xpdf.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/xpdf_python/check_xpdf.py b/xpdf_python/check_xpdf.py index ce9629f..587ada1 100644 --- a/xpdf_python/check_xpdf.py +++ b/xpdf_python/check_xpdf.py @@ -1,7 +1,8 @@ import os import sys -if os.path.isfile('/usr/local/bin/pdftotext'): +# should cover windows, linux, or mac distros using either the pdftotext distro or the XPDF tools distro +if any(i in os.environ['PATH'] for i in ('pdftotext', 'XPDF')): pass else: - sys.exit("Did not detect correctly installed xpdf. Please follow install instructions at: https://github.com/ecatkins/xpdf_python.") \ No newline at end of file + sys.exit("Did not detect correctly installed xpdf. Please follow install instructions at: https://github.com/ecatkins/xpdf_python.") From 5f68c1fa49b4f7545e6a687a0865d86986cef033 Mon Sep 17 00:00:00 2001 From: donknight <31926144+donknight@users.noreply.github.com> Date: Thu, 14 Sep 2017 10:30:06 -0400 Subject: [PATCH 2/4] Fix saved_file bugs Move saved_file path creation outside loop. Only needs to happen once. Change from str.replace to os.path.splitext to fix issues on some systems where capitalized extensions cause overwriting of the read file with the saved_file. 
Wrap file operation in a with statement so that a single-page blank PDF no longer retains a file lock on saved_file --- xpdf_python/wrapper.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/xpdf_python/wrapper.py b/xpdf_python/wrapper.py index 792e73a..a978d94 100644 --- a/xpdf_python/wrapper.py +++ b/xpdf_python/wrapper.py @@ -35,6 +35,9 @@ def to_text(file_loc, page_nums = True): cd = os.getcwd() full_file_loc = os.path.join(cd, file_loc) + path, file = os.path.split(full_file_loc) + saved_file = os.path.join(path, os.path.splitext(file)[0] + '.txt') + text = '' actual_count = 0 @@ -49,10 +52,10 @@ def to_text(file_loc, page_nums = True): actual = i + 1 # Calls xpdf subprocess.call(['pdftotext', '-f', str(actual),'-l', str(actual), full_file_loc]) - # Opens file saved to disk - saved_file = full_file_loc.replace('.pdf','.txt') - file = open(saved_file,'r', encoding = "ISO-8859-1") - t = file.read() + + # Opens file saved to disk, ensures it will always close when done + with open(saved_file, 'r', encoding='ISO-8859-1') as file: + t = file.read() # If the page is blank, it is not a real page if t == '': continue @@ -60,7 +63,7 @@ def to_text(file_loc, page_nums = True): actual_count += 1 # Add text and page count to existing string text += '***Page {}*** {}'.format(actual, t) - file.close() + else: # TO BE IMPLEMENTED pass From 293a35fa8cdfcea13bc20f3380a313ff3b01baf5 Mon Sep 17 00:00:00 2001 From: donknight <31926144+donknight@users.noreply.github.com> Date: Thu, 14 Sep 2017 10:45:26 -0400 Subject: [PATCH 3/4] Implement page_nums=False behavior Add behavior for not prepending page numbers to each page. 
Add options arg to allow passing options to pdftotext call --- xpdf_python/wrapper.py | 48 ++++++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/xpdf_python/wrapper.py b/xpdf_python/wrapper.py index a978d94..cf7baf8 100644 --- a/xpdf_python/wrapper.py +++ b/xpdf_python/wrapper.py @@ -13,13 +13,14 @@ def countPages(filename): data = open(filename,"r", encoding = "ISO-8859-1").read() return len(rxcountpages.findall(data)) -def to_text(file_loc, page_nums = True): +def to_text(file_loc, page_nums=True, options=()): ''' Converts PDF to text Args - - - - - - - file_loc: path to pdf document, string page_nums: whether to insert page numbers into document, boolean + options: allows the addition of any of the normal options accepted by pdftotext, tuple of strings Returns - - - - - - - @@ -43,30 +44,37 @@ def to_text(file_loc, page_nums = True): # If page numbers are to be inserted if page_nums: - # Count number of pages num = countPages(full_file_loc) # Accounts for errors occuring in countPages function if num == 0: num = 100 - for i in range(num): - actual = i + 1 - # Calls xpdf - subprocess.call(['pdftotext', '-f', str(actual),'-l', str(actual), full_file_loc]) - - # Opens file saved to disk, ensures it will always close when done - with open(saved_file, 'r', encoding='ISO-8859-1') as file: - t = file.read() - # If the page is blank, it is not a real page - if t == '': - continue - else: - actual_count += 1 - # Add text and page count to existing string - text += '***Page {}*** {}'.format(actual, t) - else: - # TO BE IMPLEMENTED - pass + # accounts for not adding page numbers by allowing the loop to go just one bulk action + num = 1 + actual_count = countPages(full_file_loc) # try and provide a page estimate on bulk runs + + for i in range(num): + actual = i + 1 + opt = options + if page_nums: + opt += ('-f', str(actual), '-l', str(actual)) + + # Calls xpdf + subprocess.call(['pdftotext', *opt, full_file_loc]) + + # 
Opens file saved to disk, ensures it will always close when done + with open(saved_file, 'r', encoding='ISO-8859-1') as file: + t = file.read() + # If the page is blank, it is not a real page + if t == '': + continue + if page_nums: + actual_count += 1 + # Add text and page count to existing string, or just the text if not page_nums + if page_nums: + text += '***Page {}*** {}'.format(actual, t) + else: + text = t # Remove file saved to disk os.remove(saved_file) From 063519fb5ae6a667975ebb43af054e52eed05dad Mon Sep 17 00:00:00 2001 From: donknight <31926144+donknight@users.noreply.github.com> Date: Thu, 14 Sep 2017 12:38:07 -0400 Subject: [PATCH 4/4] Reformat pdftotext check Change the check to look a little saner. If: pass else: dostuff hurts my brain a little. --- xpdf_python/check_xpdf.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/xpdf_python/check_xpdf.py b/xpdf_python/check_xpdf.py index 587ada1..ad21254 100644 --- a/xpdf_python/check_xpdf.py +++ b/xpdf_python/check_xpdf.py @@ -2,7 +2,5 @@ import sys # should cover windows, linux, or mac distros using either the pdftotext distro or the XPDF tools distro -if any(i in os.environ['PATH'] for i in ('pdftotext', 'XPDF')): - pass -else: +if not any(i in os.environ['PATH'] for i in ('pdftotext', 'XPDF')): sys.exit("Did not detect correctly installed xpdf. Please follow install instructions at: https://github.com/ecatkins/xpdf_python.")