Skip to content

Commit 0665090

Browse files
authored
Merge pull request #8 from pdftables/return-strings
Support eliding output name; return string or bytes
2 parents 736bd27 + 5bbe52e commit 0665090

File tree

2 files changed

+96
-17
lines changed

2 files changed

+96
-17
lines changed

pdftables_api/pdftables_api.py

Lines changed: 47 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616

1717
import requests
1818

19+
from shutil import copyfileobj
20+
1921

2022
FORMAT_CSV = 'csv'
2123
FORMAT_XLSX_MULTIPLE = 'xlsx-multiple'
@@ -37,59 +39,90 @@
3739
'.xlsx': FORMAT_XLSX,
3840
'.xml': FORMAT_XML,
3941
}
40-
42+
_STRING_FORMATS = {FORMAT_CSV, FORMAT_XML}
4143

4244
class Client(object):
4345
def __init__(self, api_key, api_url=_API_URL, timeout=_DEFAULT_TIMEOUT):
4446
self.api_key = api_key
4547
self.api_url = api_url
4648
self.timeout = timeout
4749

48-
def xlsx(self, pdf_path, xlsx_path):
50+
def xlsx(self, pdf_path, xlsx_path=None):
4951
"""
5052
Convenience method to convert PDF to XLSX multiple sheets.
53+
54+
If xlsx_path is None, returns the output as a byte string.
5155
"""
5256
return self.xlsx_multiple(pdf_path, xlsx_path)
5357

54-
def xlsx_single(self, pdf_path, xlsx_path):
58+
def xlsx_single(self, pdf_path, xlsx_path=None):
5559
"""
5660
Convenience method to convert PDF to XLSX single sheet.
61+
62+
If xlsx_path is None, returns the output as a byte string.
5763
"""
5864
return self.convert(pdf_path, xlsx_path, out_format=FORMAT_XLSX_SINGLE)
5965

60-
def xlsx_multiple(self, pdf_path, xlsx_path):
66+
def xlsx_multiple(self, pdf_path, xlsx_path=None):
6167
"""
6268
Convenience method to convert PDF to XLSX multiple sheets.
69+
70+
If xlsx_path is None, returns the output as a byte string.
6371
"""
6472
return self.convert(pdf_path, xlsx_path, out_format=FORMAT_XLSX_MULTIPLE)
6573

66-
def xml(self, pdf_path, xml_path):
74+
def xml(self, pdf_path, xml_path=None):
6775
"""
6876
Convenience method to convert PDF to XML.
77+
78+
If xml_path is None, returns the output as a string.
6979
"""
7080
return self.convert(pdf_path, xml_path, out_format=FORMAT_XML)
7181

72-
def csv(self, pdf_path, csv_path):
82+
def csv(self, pdf_path, csv_path=None):
7383
"""
7484
Convenience method to convert PDF to CSV.
85+
86+
If csv_path is None, returns the output as a string.
7587
"""
7688
return self.convert(pdf_path, csv_path, out_format=FORMAT_CSV)
7789

78-
def convert(self, pdf_path, out_path, out_format=None, query_params=None, **requests_params):
90+
def convert(self, pdf_path, out_path=None, out_format=None, query_params=None, **requests_params):
7991
"""
8092
Convert PDF given by `pdf_path` into `out_format` at `out_path`.
93+
94+
If `out_path` is None, returns a string containing the contents, or a
95+
bytes object for binary output types (e.g., XLSX).
8196
"""
8297
(out_path, out_format) = Client.ensure_format_ext(out_path, out_format)
8398
with open(pdf_path, 'rb') as pdf_fo:
84-
data = self.dump(pdf_fo, out_format, query_params, **requests_params)
99+
response = self.request(pdf_fo, out_format, query_params,
100+
**requests_params)
101+
102+
if out_path is None:
103+
use_text = out_format in _STRING_FORMATS
104+
return response.text if use_text else response.content
105+
85106
with open(out_path, 'wb') as out_fo:
86-
for chunk in data:
87-
if chunk:
88-
out_fo.write(chunk)
107+
converted_fo = response.raw
108+
# Ensure that gzip content is decoded.
109+
converted_fo.decode_content = True
110+
copyfileobj(converted_fo, out_fo)
111+
112+
def dump(self, pdf_fo, out_format=None, query_params=None,
113+
**requests_params):
114+
"""
115+
Convert PDF file object given by `pdf_fo` into an output stream iterator.
116+
"""
117+
response = self.request(pdf_fo, out_format, query_params,
118+
**requests_params)
89119

90-
def dump(self, pdf_fo, out_format=None, query_params=None, **requests_params):
120+
return response.iter_content(chunk_size=4096)
121+
122+
def request(self, pdf_fo, out_format=None, query_params=None,
123+
**requests_params):
91124
"""
92-
Convert PDF given by `pdf_path` into an output stream iterator.
125+
Convert PDF file object given by `pdf_fo`, returning a requests.Response object.
93126
"""
94127
if self.api_key == "":
95128
raise APIException("Invalid API key")
@@ -119,7 +152,7 @@ def dump(self, pdf_fo, out_format=None, query_params=None, **requests_params):
119152
raise APIException("Unknown format requested")
120153
response.raise_for_status()
121154

122-
return response.iter_content(chunk_size=4096)
155+
return response
123156

124157
def remaining(self, query_params=None, **requests_params):
125158
"""
@@ -140,7 +173,6 @@ def remaining(self, query_params=None, **requests_params):
140173

141174
return int(response.content)
142175

143-
144176
@staticmethod
145177
def ensure_format_ext(out_path, out_format):
146178
"""

test/test_pdftables_api.py

Lines changed: 49 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,10 @@
1313
# limitations under the License.
1414

1515
import io
16+
import os
17+
18+
from tempfile import NamedTemporaryFile
19+
1620

1721
import requests_mock
1822

@@ -68,8 +72,51 @@ def test_successful_conversion(self):
6872

6973
pdf_fo = io.BytesIO(b'pdf content')
7074
c = Client('fake_key')
71-
s = c.dump(pdf_fo, 'csv')
72-
self.assertEqual(b'xlsx output', consume(s))
75+
76+
with NamedTemporaryFile(suffix="test.pdf") as tf:
77+
filename = tf.name
78+
79+
tf.write(b"Hello world")
80+
tf.file.close()
81+
82+
filename_out = filename.replace(".pdf", ".xlsx")
83+
84+
try:
85+
s = c.convert(filename, filename_out)
86+
87+
with open(filename_out) as fd:
88+
self.assertEqual(fd.read(), "xlsx output")
89+
finally:
90+
try:
91+
os.unlink(filename_out)
92+
except OSError:
93+
pass
94+
95+
def test_successful_conversion_bytes(self):
96+
with requests_mock.mock() as m:
97+
m.post('https://pdftables.com/api?key=fake_key', content=b'xlsx output')
98+
99+
with NamedTemporaryFile(suffix="test.pdf") as tf:
100+
filename = tf.name
101+
tf.write(b"Hello world")
102+
tf.file.close()
103+
104+
output = Client('fake_key').convert(filename)
105+
106+
self.assertEqual(b'xlsx output', output)
107+
108+
def test_successful_conversion_string(self):
109+
with requests_mock.mock() as m:
110+
m.post('https://pdftables.com/api?key=fake_key', text='csv output')
111+
112+
with NamedTemporaryFile(suffix="test.pdf") as tf:
113+
filename = tf.name
114+
tf.write(b"Hello world")
115+
tf.file.close()
116+
117+
output = Client('fake_key').convert(filename, out_format="csv")
118+
119+
self.assertEqual('csv output', output)
73120

74121
def test_different_api_url(self):
75122
with requests_mock.mock() as m:

0 commit comments

Comments
 (0)