Skip to content

UNI/HTML/WIP: add encoding argument to read_html #7323

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 4, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/source/v0.14.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,9 @@ Known Issues
Enhancements
~~~~~~~~~~~~
- Tests for basic reading of public S3 buckets now exist (:issue:`7281`).
- ``read_html`` now sports an ``encoding`` argument that is passed to the
underlying parser library. You can use this to read non-ascii encoded web
pages (:issue:`7323`).

- Support for dateutil timezones, which can now be used in the same way as
pytz timezones across pandas. (:issue:`4688`)
Expand Down
53 changes: 32 additions & 21 deletions pandas/io/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,30 +98,33 @@ def _get_skiprows(skiprows):
type(skiprows).__name__)


def _read(io):
def _read(obj):
"""Try to read from a url, file or string.

Parameters
----------
io : str, unicode, or file-like
obj : str, unicode, or file-like

Returns
-------
raw_text : str
"""
if _is_url(io):
with urlopen(io) as url:
raw_text = url.read()
elif hasattr(io, 'read'):
raw_text = io.read()
elif os.path.isfile(io):
with open(io) as f:
raw_text = f.read()
elif isinstance(io, string_types):
raw_text = io
if _is_url(obj):
with urlopen(obj) as url:
text = url.read()
elif hasattr(obj, 'read'):
text = obj.read()
elif isinstance(obj, string_types):
text = obj
try:
if os.path.isfile(text):
with open(text, 'rb') as f:
return f.read()
except TypeError:
pass
else:
raise TypeError("Cannot read object of type %r" % type(io).__name__)
return raw_text
raise TypeError("Cannot read object of type %r" % type(obj).__name__)
return text


class _HtmlFrameParser(object):
Expand Down Expand Up @@ -165,10 +168,11 @@ class _HtmlFrameParser(object):
See each method's respective documentation for details on their
functionality.
"""
def __init__(self, io, match, attrs):
def __init__(self, io, match, attrs, encoding):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should this default to None, then set to utf-8? (or just not set and leave as None)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it defaults to None (from the read_html entry point) because I didn't want to enforce an encoding if bs4 or lxml can parse it from HTML meta information.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

k...sounds good

self.io = io
self.match = match
self.attrs = attrs
self.encoding = encoding

def parse_tables(self):
tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
Expand Down Expand Up @@ -422,7 +426,8 @@ def _setup_build_doc(self):

def _build_doc(self):
from bs4 import BeautifulSoup
return BeautifulSoup(self._setup_build_doc(), features='html5lib')
return BeautifulSoup(self._setup_build_doc(), features='html5lib',
from_encoding=self.encoding)


def _build_xpath_expr(attrs):
Expand Down Expand Up @@ -519,7 +524,7 @@ def _build_doc(self):
from lxml.html import parse, fromstring, HTMLParser
from lxml.etree import XMLSyntaxError

parser = HTMLParser(recover=False)
parser = HTMLParser(recover=False, encoding=self.encoding)

try:
# try to parse the input in the simplest way
Expand Down Expand Up @@ -689,15 +694,15 @@ def _validate_flavor(flavor):


def _parse(flavor, io, match, header, index_col, skiprows, infer_types,
parse_dates, tupleize_cols, thousands, attrs):
parse_dates, tupleize_cols, thousands, attrs, encoding):
flavor = _validate_flavor(flavor)
compiled_match = re.compile(match) # you can pass a compiled regex here

# hack around python 3 deleting the exception variable
retained = None
for flav in flavor:
parser = _parser_dispatch(flav)
p = parser(io, compiled_match, attrs)
p = parser(io, compiled_match, attrs, encoding)

try:
tables = p.parse_tables()
Expand All @@ -715,7 +720,7 @@ def _parse(flavor, io, match, header, index_col, skiprows, infer_types,

def read_html(io, match='.+', flavor=None, header=None, index_col=None,
skiprows=None, infer_types=None, attrs=None, parse_dates=False,
tupleize_cols=False, thousands=','):
tupleize_cols=False, thousands=',', encoding=None):
r"""Read HTML tables into a ``list`` of ``DataFrame`` objects.

Parameters
Expand Down Expand Up @@ -792,6 +797,12 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
thousands : str, optional
Separator to use to parse thousands. Defaults to ``','``.

encoding : str or None, optional
The encoding used to decode the web page. Defaults to ``None``. ``None``
preserves the previous encoding behavior, which depends on the
underlying parser library (e.g., the parser library will try to use
the encoding provided by the document).

Returns
-------
dfs : list of DataFrames
Expand Down Expand Up @@ -837,4 +848,4 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
raise ValueError('cannot skip rows starting from the end of the '
'data (you passed a negative value)')
return _parse(flavor, io, match, header, index_col, skiprows, infer_types,
parse_dates, tupleize_cols, thousands, attrs)
parse_dates, tupleize_cols, thousands, attrs, encoding)
Binary file not shown.
Binary file not shown.
26 changes: 26 additions & 0 deletions pandas/io/tests/data/html_encoding/chinese_utf8.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>0</th>
<th>1</th>
</tr>
</thead>
<tbody>
<tr>
<th>0</th>
<td> 漊煻獌</td>
<td> 漊煻獌</td>
</tr>
<tr>
<th>1</th>
<td> 袟袘觕</td>
<td> 袟袘觕</td>
</tr>
<tr>
<th>2</th>
<td> 埱娵徖</td>
<td> 埱娵徖</td>
</tr>
</tbody>
</table>
26 changes: 26 additions & 0 deletions pandas/io/tests/data/html_encoding/letz_latin1.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>0</th>
<th>1</th>
</tr>
</thead>
<tbody>
<tr>
<th>0</th>
<td> Gét</td>
<td> Gét</td>
</tr>
<tr>
<th>1</th>
<td> m�</td>
<td> m�</td>
</tr>
<tr>
<th>2</th>
<td> iech</td>
<td> iech</td>
</tr>
</tbody>
</table>
Loading