14
14
from pandas .errors import EmptyDataError
15
15
from pandas .io .common import _is_url , urlopen , _validate_header_arg
16
16
from pandas .io .parsers import TextParser
17
- from pandas .compat import (lrange , lmap , lfilter , u , string_types , iteritems ,
17
+ from pandas .compat import (lrange , lmap , u , string_types , iteritems ,
18
18
raise_with_traceback , binary_type )
19
19
from pandas import Series
20
20
import pandas .core .common as com
@@ -189,6 +189,7 @@ class _HtmlFrameParser(object):
189
189
-----
190
190
To subclass this class effectively you must override the following methods:
191
191
* :func:`_build_doc`
192
+ * :func:`_attr_getter`
192
193
* :func:`_text_getter`
193
194
* :func:`_parse_td`
194
195
* :func:`_parse_thead_tr`
@@ -208,7 +209,8 @@ def __init__(self, io, match, attrs, encoding, displayed_only):
208
209
self .displayed_only = displayed_only
209
210
210
211
def parse_tables (self ):
211
- """Parse and return all tables from the DOM.
212
+ """
213
+ Parse and return all tables from the DOM.
212
214
213
215
Returns
214
216
-------
@@ -217,8 +219,28 @@ def parse_tables(self):
217
219
tables = self ._parse_tables (self ._build_doc (), self .match , self .attrs )
218
220
return (self ._parse_thead_tbody_tfoot (table ) for table in tables )
219
221
222
+ def _attr_getter (self , obj , attr ):
223
+ """
224
+ Return the attribute value of an individual DOM node.
225
+
226
+ Parameters
227
+ ----------
228
+ obj : node-like
229
+ A DOM node.
230
+
231
+ attr : str or unicode
232
+ The attribute name, such as "colspan".
233
+
234
+ Returns
235
+ -------
236
+ text : str or unicode
237
+ The attribute value.
238
+ """
239
+ raise com .AbstractMethodError (self )
240
+
220
241
def _text_getter (self , obj ):
221
- """Return the text of an individual DOM node.
242
+ """
243
+ Return the text of an individual DOM node.
222
244
223
245
Parameters
224
246
----------
@@ -237,7 +259,8 @@ def _parse_td(self, obj):
237
259
238
260
Parameters
239
261
----------
240
- obj : an HTML row element
262
+ obj : node-like
263
+ A DOM <tr> node.
241
264
242
265
Returns
243
266
-------
@@ -247,7 +270,8 @@ def _parse_td(self, obj):
247
270
raise com .AbstractMethodError (self )
248
271
249
272
def _parse_thead_tr (self , table ):
250
- """Return the list of thead row elements from the parsed table element.
273
+ """
274
+ Return the list of thead row elements from the parsed table element.
251
275
252
276
Parameters
253
277
----------
@@ -260,7 +284,8 @@ def _parse_thead_tr(self, table):
260
284
raise com .AbstractMethodError (self )
261
285
262
286
def _parse_tbody_tr (self , table ):
263
- """Return the list of tbody row elements from the parsed table element.
287
+ """
288
+ Return the list of tbody row elements from the parsed table element.
264
289
265
290
HTML5 table bodies consist of either 0 or more <tbody> elements (which
266
291
only contain <tr> elements) or 0 or more <tr> elements. This method
@@ -277,7 +302,8 @@ def _parse_tbody_tr(self, table):
277
302
raise com .AbstractMethodError (self )
278
303
279
304
def _parse_tfoot_tr (self , table ):
280
- """Return the list of tfoot row elements from the parsed table element.
305
+ """
306
+ Return the list of tfoot row elements from the parsed table element.
281
307
282
308
Parameters
283
309
----------
@@ -290,7 +316,8 @@ def _parse_tfoot_tr(self, table):
290
316
raise com .AbstractMethodError (self )
291
317
292
318
def _parse_tables (self , doc , match , attrs ):
293
- """Return all tables from the parsed DOM.
319
+ """
320
+ Return all tables from the parsed DOM.
294
321
295
322
Parameters
296
323
----------
@@ -314,7 +341,8 @@ def _parse_tables(self, doc, match, attrs):
314
341
raise com .AbstractMethodError (self )
315
342
316
343
def _equals_tag (self , obj , tag ):
317
- """Return whether an individual DOM node matches a tag
344
+ """
345
+ Return whether an individual DOM node matches a tag.
318
346
319
347
Parameters
320
348
----------
@@ -332,7 +360,8 @@ def _equals_tag(self, obj, tag):
332
360
raise com .AbstractMethodError (self )
333
361
334
362
def _build_doc (self ):
335
- """Return a tree-like object that can be used to iterate over the DOM.
363
+ """
364
+ Return a tree-like object that can be used to iterate over the DOM.
336
365
337
366
Returns
338
367
-------
@@ -341,48 +370,49 @@ def _build_doc(self):
341
370
raise com .AbstractMethodError (self )
342
371
343
372
def _parse_thead_tbody_tfoot (self , table_html ):
344
- """Given a table, return parsed header, body, and foot.
345
- Header and body are lists-of-lists. Top level list is a list of
346
- rows. Each row is a list of parsed elements.
347
-
348
- Logic: Use <thead>, <tbody>, <tfoot> elements to identify
349
- header, body, and footer, otherwise:
350
- - Put all rows into body
351
- - Move rows from top of body to header only if
352
- all elements inside row are <th>
353
- - Move rows from bottom of body to footer only if
354
- all elements inside row are <th>
373
+ """
374
+ Given a table, return parsed header, body, and foot.
375
+
376
+ Header and body are lists-of-lists. Top level list is a list of
377
+ rows. Each row is a list of text strings.
378
+
379
+ Logic: Use <thead>, <tbody>, <tfoot> elements to identify
380
+ header, body, and footer, otherwise:
381
+ - Put all rows into body
382
+ - Move rows from top of body to header only if
383
+ all elements inside row are <th>
384
+ - Move rows from bottom of body to footer only if
385
+ all elements inside row are <th>
355
386
356
387
Parameters
357
388
----------
358
- table_html : a single HTML table element.
389
+ table_html : node-like
359
390
360
391
Returns
361
392
-------
362
393
tuple of (header, body, footer)
363
- header : list of rows, each of which is a list of parsed
364
- header elements
365
- body : list of rows, each of which is a list of parsed body elements
366
- footer : list of rows, each of which is a list of parsed
367
- footer elements
368
394
"""
369
395
370
396
header_rows = self ._parse_thead_tr (table_html )
371
397
body_rows = self ._parse_tbody_tr (table_html )
372
398
footer_rows = self ._parse_tfoot_tr (table_html )
373
399
400
+ def row_is_all_th (row ):
401
+ return all (self ._equals_tag (t , 'th' ) for t in
402
+ self ._parse_td (row ))
403
+
374
404
if not header_rows :
375
- # The table has no <thead>. Treat first all-<th> rows as headers.
376
- while body_rows and all ( self . _equals_tag ( t , 'th' ) for t in
377
- self . _parse_td ( body_rows [ 0 ])):
378
- # this row should be a header row, move it from body to header
405
+ # The table has no <thead>. Move the top all-<th> rows from the
406
+ # <tbody> to the <thead>. (This is a common case because many
407
+ # tables in the wild have no <thead> or <tfoot>.)
408
+ while body_rows and row_is_all_th ( body_rows [ 0 ]):
379
409
header_rows .append (body_rows .pop (0 ))
380
410
381
411
if not footer_rows :
382
412
# The table has no <tfoot>. Treat last all-<th> rows as footers.
383
- while body_rows and all ( self . _equals_tag ( t , 'th' ) for t in
384
- self . _parse_td ( body_rows [ - 1 ])):
385
- # this row should be a footer row, move it from body to footer
413
+ while body_rows and row_is_all_th ( body_rows [ - 1 ]):
414
+ # .insert(), not .append(): we're moving "bottom of <tbody>" to
415
+ # "top of <tfoot>"
386
416
footer_rows .insert (0 , body_rows .pop ())
387
417
388
418
header = self ._expand_colspan_rowspan (header_rows )
@@ -392,8 +422,9 @@ def _parse_thead_tbody_tfoot(self, table_html):
392
422
return header , body , footer
393
423
394
424
def _expand_colspan_rowspan (self , rows ):
395
- """Given a list of <tr>s, return a list of text rows that copy cell
396
- text across rowspans/colspans.
425
+ """
426
+ Given a list of <tr>s, return a list of text rows that copy cell
427
+ text across rowspans/colspans.
397
428
398
429
Parameters
399
430
----------
@@ -404,50 +435,69 @@ def _expand_colspan_rowspan(self, rows):
404
435
res : list of rows, each of which is a list of str in that row
405
436
"""
406
437
407
- res = []
408
- saved_span = []
409
- for row in rows :
410
- extracted_row = self ._parse_td (row )
411
- cols_text = [_remove_whitespace (
412
- self ._text_getter (col )) for col in extracted_row ]
413
- col_colspans = [int (col .get ('colspan' , 1 ))
414
- for col in extracted_row ]
415
- col_rowspans = [int (col .get ('rowspan' , 1 ))
416
- for col in extracted_row ]
417
- # expand cols using col_colspans
418
- # maybe this can be done with a list comprehension, dunno
419
- cols = list (zip (
420
- list (com .flatten (
421
- lmap (lambda text_nc : [text_nc [0 ]] * text_nc [1 ],
422
- list (zip (cols_text , col_colspans ))))),
423
- list (com .flatten (
424
- lmap (lambda nc_nr : [nc_nr [1 ]] * nc_nr [0 ],
425
- list (zip (col_colspans , col_rowspans ))))))
426
- )
427
- # cols is now a list of (text, number of rows)
428
- # now insert any previous rowspans
429
- for (col , (text , nr )) in saved_span :
430
- cols .insert (col , (text , nr ))
431
-
432
- # save next saved_span
433
- def advance_item_to_next_row (item ):
434
- (col , (text , nr )) = item
435
- if nr == 1 :
436
- return None
437
- else :
438
- return (col , (text , nr - 1 ))
439
- saved_span = lfilter (lambda i : i is not None ,
440
- lmap (advance_item_to_next_row ,
441
- list (enumerate (cols ))))
442
- cols = [text for (text , nr ) in cols ]
443
- # generate cols with text only
444
- if any ([col != '' for col in cols ]):
445
- res .append (cols )
446
-
447
- return res
438
+ all_texts = [] # list of rows, each a list of str
439
+ remainder = [] # list of (index, text, nrows)
440
+
441
+ for tr in rows :
442
+ texts = [] # the output for this row
443
+ next_remainder = []
444
+
445
+ index = 0
446
+ tds = self ._parse_td (tr )
447
+ for td in tds :
448
+ # Append texts from previous rows with rowspan>1 that come
449
+ # before this <td>
450
+ while remainder and remainder [0 ][0 ] <= index :
451
+ prev_i , prev_text , prev_rowspan = remainder .pop (0 )
452
+ texts .append (prev_text )
453
+ if prev_rowspan > 1 :
454
+ next_remainder .append ((prev_i , prev_text ,
455
+ prev_rowspan - 1 ))
456
+ index += 1
457
+
458
+ # Append the text from this <td>, colspan times
459
+ text = _remove_whitespace (self ._text_getter (td ))
460
+ rowspan = int (self ._attr_getter (td , 'rowspan' ) or 1 )
461
+ colspan = int (self ._attr_getter (td , 'colspan' ) or 1 )
462
+
463
+ for _ in range (colspan ):
464
+ texts .append (text )
465
+ if rowspan > 1 :
466
+ next_remainder .append ((index , text , rowspan - 1 ))
467
+ index += 1
468
+
469
+ # Append texts from previous rows at the final position
470
+ for prev_i , prev_text , prev_rowspan in remainder :
471
+ texts .append (prev_text )
472
+ if prev_rowspan > 1 :
473
+ next_remainder .append ((prev_i , prev_text ,
474
+ prev_rowspan - 1 ))
475
+
476
+ all_texts .append (texts )
477
+ remainder = next_remainder
478
+
479
+ # Append rows that only appear because an earlier row had a rowspan
480
+ # greater than 1
481
+ while remainder :
482
+ next_remainder = []
483
+ texts = []
484
+ for prev_i , prev_text , prev_rowspan in remainder :
485
+ texts .append (prev_text )
486
+ if prev_rowspan > 1 :
487
+ next_remainder .append ((prev_i , prev_text ,
488
+ prev_rowspan - 1 ))
489
+ all_texts .append (texts )
490
+ remainder = next_remainder
491
+
492
+ # ignore all-empty-text rows
493
+ no_empty = [row for row in all_texts
494
+ if any (text for text in row )]
495
+
496
+ return no_empty
448
497
449
498
def _handle_hidden_tables (self , tbl_list , attr_name ):
450
- """Returns list of tables, potentially removing hidden elements
499
+ """
500
+ Return list of tables, potentially removing hidden elements.
451
501
452
502
Parameters
453
503
----------
@@ -515,6 +565,9 @@ def _parse_tables(self, doc, match, attrs):
515
565
.format (patt = match .pattern ))
516
566
return result
517
567
568
+ def _attr_getter (self , obj , attr ):
569
+ return obj .get (attr )
570
+
518
571
def _text_getter (self , obj ):
519
572
return obj .text
520
573
@@ -596,11 +649,14 @@ class _LxmlFrameParser(_HtmlFrameParser):
596
649
def __init__ (self , * args , ** kwargs ):
597
650
super (_LxmlFrameParser , self ).__init__ (* args , ** kwargs )
598
651
652
+ def _attr_getter (self , obj , attr ):
653
+ return obj .get (attr )
654
+
599
655
def _text_getter (self , obj ):
600
656
return obj .text_content ()
601
657
602
658
def _parse_td (self , row ):
603
- # Look for direct descendents only: the "row" element here may be a
659
+ # Look for direct children only: the "row" element here may be a
604
660
# <thead> or <tfoot> (see _parse_thead_tr).
605
661
return row .xpath ('./td|./th' )
606
662
@@ -694,12 +750,14 @@ def _parse_thead_tr(self, table):
694
750
for thead in table .xpath ('.//thead' ):
695
751
rows .extend (thead .xpath ('./tr' ))
696
752
697
- # lxml does not clean up the clearly-erroneous
698
- # <thead><th>foo</th><th>bar</th></thead>.
753
+ # HACK: lxml does not clean up the clearly-erroneous
754
+ # <thead><th>foo</th><th>bar</th></thead>. (Missing <tr>). Add
755
+ # the <thead> and _pretend_ it's a <tr>; _parse_td() will find its
756
+ # children as though it's a <tr>.
757
+ #
758
+ # A better solution would be to use html5lib.
699
759
elements_at_root = thead .xpath ('./td|./th' )
700
760
if elements_at_root :
701
- # Pass the entire <thead> as a row. _parse_td() will interpret
702
- # it correctly.
703
761
rows .append (thead )
704
762
705
763
return rows
0 commit comments