Skip to content

Commit 6c2c11a

Browse files
committed
Fixes after code review -- thanks, @WillAyd
1 parent 41c1d82 commit 6c2c11a

File tree

2 files changed

+295
-173
lines changed

2 files changed

+295
-173
lines changed

pandas/io/html.py

Lines changed: 141 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
from pandas.errors import EmptyDataError
1515
from pandas.io.common import _is_url, urlopen, _validate_header_arg
1616
from pandas.io.parsers import TextParser
17-
from pandas.compat import (lrange, lmap, lfilter, u, string_types, iteritems,
17+
from pandas.compat import (lrange, lmap, u, string_types, iteritems,
1818
raise_with_traceback, binary_type)
1919
from pandas import Series
2020
import pandas.core.common as com
@@ -189,6 +189,7 @@ class _HtmlFrameParser(object):
189189
-----
190190
To subclass this class effectively you must override the following methods:
191191
* :func:`_build_doc`
192+
* :func:`_attr_getter`
192193
* :func:`_text_getter`
193194
* :func:`_parse_td`
194195
* :func:`_parse_thead_tr`
@@ -208,7 +209,8 @@ def __init__(self, io, match, attrs, encoding, displayed_only):
208209
self.displayed_only = displayed_only
209210

210211
def parse_tables(self):
211-
"""Parse and return all tables from the DOM.
212+
"""
213+
Parse and return all tables from the DOM.
212214
213215
Returns
214216
-------
@@ -217,8 +219,28 @@ def parse_tables(self):
217219
tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
218220
return (self._parse_thead_tbody_tfoot(table) for table in tables)
219221

222+
def _attr_getter(self, obj, attr):
223+
"""
224+
Return the attribute value of an individual DOM node.
225+
226+
Parameters
227+
----------
228+
obj : node-like
229+
A DOM node.
230+
231+
attr : str or unicode
232+
The attribute, such as "colspan".
233+
234+
Returns
235+
-------
236+
text : str or unicode
237+
The attribute value.
238+
"""
239+
raise com.AbstractMethodError(self)
240+
220241
def _text_getter(self, obj):
221-
"""Return the text of an individual DOM node.
242+
"""
243+
Return the text of an individual DOM node.
222244
223245
Parameters
224246
----------
@@ -237,7 +259,8 @@ def _parse_td(self, obj):
237259
238260
Parameters
239261
----------
240-
obj : an HTML row element
262+
obj : node-like
263+
A DOM <tr> node.
241264
242265
Returns
243266
-------
@@ -247,7 +270,8 @@ def _parse_td(self, obj):
247270
raise com.AbstractMethodError(self)
248271

249272
def _parse_thead_tr(self, table):
250-
"""Return the list of thead row elements from the parsed table element.
273+
"""
274+
Return the list of thead row elements from the parsed table element.
251275
252276
Parameters
253277
----------
@@ -260,7 +284,8 @@ def _parse_thead_tr(self, table):
260284
raise com.AbstractMethodError(self)
261285

262286
def _parse_tbody_tr(self, table):
263-
"""Return the list of tbody row elements from the parsed table element.
287+
"""
288+
Return the list of tbody row elements from the parsed table element.
264289
265290
HTML5 table bodies consist of either 0 or more <tbody> elements (which
266291
only contain <tr> elements) or 0 or more <tr> elements. This method
@@ -277,7 +302,8 @@ def _parse_tbody_tr(self, table):
277302
raise com.AbstractMethodError(self)
278303

279304
def _parse_tfoot_tr(self, table):
280-
"""Return the list of tfoot row elements from the parsed table element.
305+
"""
306+
Return the list of tfoot row elements from the parsed table element.
281307
282308
Parameters
283309
----------
@@ -290,7 +316,8 @@ def _parse_tfoot_tr(self, table):
290316
raise com.AbstractMethodError(self)
291317

292318
def _parse_tables(self, doc, match, attrs):
293-
"""Return all tables from the parsed DOM.
319+
"""
320+
Return all tables from the parsed DOM.
294321
295322
Parameters
296323
----------
@@ -314,7 +341,8 @@ def _parse_tables(self, doc, match, attrs):
314341
raise com.AbstractMethodError(self)
315342

316343
def _equals_tag(self, obj, tag):
317-
"""Return whether an individual DOM node matches a tag
344+
"""
345+
Return whether an individual DOM node matches a tag.
318346
319347
Parameters
320348
----------
@@ -332,7 +360,8 @@ def _equals_tag(self, obj, tag):
332360
raise com.AbstractMethodError(self)
333361

334362
def _build_doc(self):
335-
"""Return a tree-like object that can be used to iterate over the DOM.
363+
"""
364+
Return a tree-like object that can be used to iterate over the DOM.
336365
337366
Returns
338367
-------
@@ -341,48 +370,49 @@ def _build_doc(self):
341370
raise com.AbstractMethodError(self)
342371

343372
def _parse_thead_tbody_tfoot(self, table_html):
344-
"""Given a table, return parsed header, body, and foot.
345-
Header and body are lists-of-lists. Top level list is a list of
346-
rows. Each row is a list of parsed elements.
347-
348-
Logic: Use <thead>, <tbody>, <tfoot> elements to identify
349-
header, body, and footer, otherwise:
350-
- Put all rows into body
351-
- Move rows from top of body to header only if
352-
all elements inside row are <th>
353-
- Move rows from bottom of body to footer only if
354-
all elements inside row are <th>
373+
"""
374+
Given a table, return parsed header, body, and foot.
375+
376+
Header and body are lists-of-lists. Top level list is a list of
377+
rows. Each row is a list of text strings.
378+
379+
Logic: Use <thead>, <tbody>, <tfoot> elements to identify
380+
header, body, and footer, otherwise:
381+
- Put all rows into body
382+
- Move rows from top of body to header only if
383+
all elements inside row are <th>
384+
- Move rows from bottom of body to footer only if
385+
all elements inside row are <th>
355386
356387
Parameters
357388
----------
358-
table_html : a single HTML table element.
389+
table_html : node-like
359390
360391
Returns
361392
-------
362393
tuple of (header, body, footer)
363-
header : list of rows, each of which is a list of parsed
364-
header elements
365-
body : list of rows, each of which is a list of parsed body elements
366-
footer : list of rows, each of which is a list of parsed
367-
footer elements
368394
"""
369395

370396
header_rows = self._parse_thead_tr(table_html)
371397
body_rows = self._parse_tbody_tr(table_html)
372398
footer_rows = self._parse_tfoot_tr(table_html)
373399

400+
def row_is_all_th(row):
401+
return all(self._equals_tag(t, 'th') for t in
402+
self._parse_td(row))
403+
374404
if not header_rows:
375-
# The table has no <thead>. Treat first all-<th> rows as headers.
376-
while body_rows and all(self._equals_tag(t, 'th') for t in
377-
self._parse_td(body_rows[0])):
378-
# this row should be a header row, move it from body to header
405+
# The table has no <thead>. Move the top all-<th> rows from the
406+
# <tbody> to the <thead>. (This is a common case because many
407+
# tables in the wild have no <thead> or <tfoot>.)
408+
while body_rows and row_is_all_th(body_rows[0]):
379409
header_rows.append(body_rows.pop(0))
380410

381411
if not footer_rows:
382412
# The table has no <tfoot>. Treat last all-<th> rows as footers.
383-
while body_rows and all(self._equals_tag(t, 'th') for t in
384-
self._parse_td(body_rows[-1])):
385-
# this row should be a footer row, move it from body to footer
413+
while body_rows and row_is_all_th(body_rows[-1]):
414+
# .insert(), not .append(): we're moving "bottom of <tbody>" to
415+
# "top of <tfoot>"
386416
footer_rows.insert(0, body_rows.pop())
387417

388418
header = self._expand_colspan_rowspan(header_rows)
@@ -392,8 +422,9 @@ def _parse_thead_tbody_tfoot(self, table_html):
392422
return header, body, footer
393423

394424
def _expand_colspan_rowspan(self, rows):
395-
"""Given a list of <tr>s, return a list of text rows that copy cell
396-
text across rowspans/colspans.
425+
"""
426+
Given a list of <tr>s, return a list of text rows that copy cell
427+
text across rowspans/colspans.
397428
398429
Parameters
399430
----------
@@ -404,50 +435,69 @@ def _expand_colspan_rowspan(self, rows):
404435
res : list of rows, each of which is a list of str in that row
405436
"""
406437

407-
res = []
408-
saved_span = []
409-
for row in rows:
410-
extracted_row = self._parse_td(row)
411-
cols_text = [_remove_whitespace(
412-
self._text_getter(col)) for col in extracted_row]
413-
col_colspans = [int(col.get('colspan', 1))
414-
for col in extracted_row]
415-
col_rowspans = [int(col.get('rowspan', 1))
416-
for col in extracted_row]
417-
# expand cols using col_colspans
418-
# maybe this can be done with a list comprehension, dunno
419-
cols = list(zip(
420-
list(com.flatten(
421-
lmap(lambda text_nc: [text_nc[0]] * text_nc[1],
422-
list(zip(cols_text, col_colspans))))),
423-
list(com.flatten(
424-
lmap(lambda nc_nr: [nc_nr[1]] * nc_nr[0],
425-
list(zip(col_colspans, col_rowspans))))))
426-
)
427-
# cols is now a list of (text, number of rows)
428-
# now insert any previous rowspans
429-
for (col, (text, nr)) in saved_span:
430-
cols.insert(col, (text, nr))
431-
432-
# save next saved_span
433-
def advance_item_to_next_row(item):
434-
(col, (text, nr)) = item
435-
if nr == 1:
436-
return None
437-
else:
438-
return (col, (text, nr - 1))
439-
saved_span = lfilter(lambda i: i is not None,
440-
lmap(advance_item_to_next_row,
441-
list(enumerate(cols))))
442-
cols = [text for (text, nr) in cols]
443-
# generate cols with text only
444-
if any([col != '' for col in cols]):
445-
res.append(cols)
446-
447-
return res
438+
all_texts = [] # list of rows, each a list of str
439+
remainder = [] # list of (index, text, nrows)
440+
441+
for tr in rows:
442+
texts = [] # the output for this row
443+
next_remainder = []
444+
445+
index = 0
446+
tds = self._parse_td(tr)
447+
for td in tds:
448+
# Append texts from previous rows with rowspan>1 that come
449+
# before this <td>
450+
while remainder and remainder[0][0] <= index:
451+
prev_i, prev_text, prev_rowspan = remainder.pop(0)
452+
texts.append(prev_text)
453+
if prev_rowspan > 1:
454+
next_remainder.append((prev_i, prev_text,
455+
prev_rowspan - 1))
456+
index += 1
457+
458+
# Append the text from this <td>, colspan times
459+
text = _remove_whitespace(self._text_getter(td))
460+
rowspan = int(self._attr_getter(td, 'rowspan') or 1)
461+
colspan = int(self._attr_getter(td, 'colspan') or 1)
462+
463+
for _ in range(colspan):
464+
texts.append(text)
465+
if rowspan > 1:
466+
next_remainder.append((index, text, rowspan - 1))
467+
index += 1
468+
469+
# Append texts from previous rows at the final position
470+
for prev_i, prev_text, prev_rowspan in remainder:
471+
texts.append(prev_text)
472+
if prev_rowspan > 1:
473+
next_remainder.append((prev_i, prev_text,
474+
prev_rowspan - 1))
475+
476+
all_texts.append(texts)
477+
remainder = next_remainder
478+
479+
# Append rows that only appear because the previous row had non-1
480+
# rowspan
481+
while remainder:
482+
next_remainder = []
483+
texts = []
484+
for prev_i, prev_text, prev_rowspan in remainder:
485+
texts.append(prev_text)
486+
if prev_rowspan > 1:
487+
next_remainder.append((prev_i, prev_text,
488+
prev_rowspan - 1))
489+
all_texts.append(texts)
490+
remainder = next_remainder
491+
492+
# ignore all-empty-text rows
493+
no_empty = [row for row in all_texts
494+
if any(text for text in row)]
495+
496+
return no_empty
448497

449498
def _handle_hidden_tables(self, tbl_list, attr_name):
450-
"""Returns list of tables, potentially removing hidden elements
499+
"""
500+
Return list of tables, potentially removing hidden elements.
451501
452502
Parameters
453503
----------
@@ -515,6 +565,9 @@ def _parse_tables(self, doc, match, attrs):
515565
.format(patt=match.pattern))
516566
return result
517567

568+
def _attr_getter(self, obj, attr):
569+
return obj.get(attr)
570+
518571
def _text_getter(self, obj):
519572
return obj.text
520573

@@ -596,11 +649,14 @@ class _LxmlFrameParser(_HtmlFrameParser):
596649
def __init__(self, *args, **kwargs):
597650
super(_LxmlFrameParser, self).__init__(*args, **kwargs)
598651

652+
def _attr_getter(self, obj, attr):
653+
return obj.get(attr)
654+
599655
def _text_getter(self, obj):
600656
return obj.text_content()
601657

602658
def _parse_td(self, row):
603-
# Look for direct descendents only: the "row" element here may be a
659+
# Look for direct children only: the "row" element here may be a
604660
# <thead> or <tfoot> (see _parse_thead_tr).
605661
return row.xpath('./td|./th')
606662

@@ -694,12 +750,14 @@ def _parse_thead_tr(self, table):
694750
for thead in table.xpath('.//thead'):
695751
rows.extend(thead.xpath('./tr'))
696752

697-
# lxml does not clean up the clearly-erroneous
698-
# <thead><th>foo</th><th>bar</th></thead>.
753+
# HACK: lxml does not clean up the clearly-erroneous
754+
# <thead><th>foo</th><th>bar</th></thead>. (Missing <tr>). Add
755+
# the <thead> and _pretend_ it's a <tr>; _parse_td() will find its
756+
# children as though it's a <tr>.
757+
#
758+
# A better solution would be to use html5lib.
699759
elements_at_root = thead.xpath('./td|./th')
700760
if elements_at_root:
701-
# Pass the entire <thead> as a row. _parse_td() will interpret
702-
# it correctly.
703761
rows.append(thead)
704762

705763
return rows

0 commit comments

Comments
 (0)