From d64a0476c0d39ce173da97ffd4417484a413bf29 Mon Sep 17 00:00:00 2001
From: Gabi Davar
Date: Fri, 16 Aug 2013 15:17:21 +0300
Subject: [PATCH 1/3] win32 paths cannot be turned into URLs by prefixing them
 with "file://"

see http://stackoverflow.com/questions/11687478/convert-a-filename-to-a-file-url
---
 pandas/io/tests/test_html.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py
index a83d85b89846e..ea7c1aa47171b 100644
--- a/pandas/io/tests/test_html.py
+++ b/pandas/io/tests/test_html.py
@@ -27,6 +27,12 @@
 from pandas.util.testing import makeCustomDataframe as mkdf


+import urlparse, urllib
+
+def path2url(path):
+    return urlparse.urljoin(
+        'file:', urllib.pathname2url(path))
+
 def _have_module(module_name):
     try:
         import_module(module_name)
@@ -292,7 +298,7 @@ def test_bad_url_protocol(self):
     @slow
     def test_file_url(self):
         url = self.banklist_data
-        dfs = self.run_read_html('file://' + url, 'First',
+        dfs = self.run_read_html(path2url(url), 'First',
                                  attrs={'id': 'table'})
         self.assertIsInstance(dfs, list)
         for df in dfs:
@@ -338,7 +344,7 @@ def test_multiindex_header_index_skiprows(self):
     @slow
     def test_regex_idempotency(self):
         url = self.banklist_data
-        dfs = self.run_read_html('file://' + url,
+        dfs = self.run_read_html(path2url(url),
                                  match=re.compile(re.compile('Florida')),
                                  attrs={'id': 'table'})
         self.assertIsInstance(dfs, list)
@@ -464,7 +470,7 @@ def test_invalid_flavor():

 def get_elements_from_url(url, element='table', base_url="file://"):
     _skip_if_none_of(('bs4', 'html5lib'))
-    url = "".join([base_url, url])
+    url = path2url(url) if base_url == "file://" else "".join([base_url, url])
     from bs4 import BeautifulSoup
     with urlopen(url) as f:
         soup = BeautifulSoup(f, features='html5lib')

From ebc23c816cc84a7f62c077a7c40a455a0bcdbe71 Mon Sep 17 00:00:00 2001
From: Gabi Davar
Date: Sat, 17 Aug 2013 12:53:07 +0300
Subject: [PATCH 2/3] make fix less patchy

---
 doc/source/release.rst       |  2 ++
 pandas/io/common.py          | 17 +++++++++++++++++
 pandas/io/tests/test_html.py | 10 ++++------
 3 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/doc/source/release.rst b/doc/source/release.rst
index cd8a62664fac1..e586f29816aac 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -39,6 +39,8 @@ pandas 0.13

 **Bug Fixes**

+  - Fixed html tests on win32. (:issue:`4580`)
+
 pandas 0.12
 ===========

diff --git a/pandas/io/common.py b/pandas/io/common.py
index 33958ade2bcd6..e447c2a15d802 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -3,6 +3,7 @@
 import sys
 import urlparse
 import urllib2
+import urllib
 import zipfile
 from contextlib import contextmanager, closing
 from StringIO import StringIO
@@ -91,6 +92,22 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None):
     return filepath_or_buffer, None


+def path_to_url(path):
+    """
+    converts an absolute native path to a FILE URL.
+
+    Parameters
+    ----------
+    path : a path in native format
+
+    Returns
+    -------
+    a valid FILE URL
+    """
+    return urlparse.urljoin(
+        'file:', urllib.pathname2url(path))
+
+
 # ----------------------
 # Prevent double closing
 if py3compat.PY3:
diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py
index ea7c1aa47171b..61a7159c9468e 100644
--- a/pandas/io/tests/test_html.py
+++ b/pandas/io/tests/test_html.py
@@ -19,6 +19,7 @@

 from pandas.io.html import read_html
 from pandas.io.common import urlopen
+from pandas.io.common import path_to_url

 from pandas import DataFrame, MultiIndex, read_csv, Timestamp
 from pandas.util.testing import (assert_frame_equal, network,
@@ -26,9 +27,6 @@
 from pandas.util.testing import makeCustomDataframe as mkdf

-
-import urlparse, urllib
-
 def path2url(path):
     return urlparse.urljoin(
         'file:', urllib.pathname2url(path))
@@ -298,7 +296,7 @@ def test_bad_url_protocol(self):
     @slow
     def test_file_url(self):
         url = self.banklist_data
-        dfs = self.run_read_html(path2url(url), 'First',
+        dfs = self.run_read_html(path_to_url(url), 'First',
                                  attrs={'id': 'table'})
         self.assertIsInstance(dfs, list)
         for df in dfs:
@@ -344,7 +342,7 @@ def test_multiindex_header_index_skiprows(self):
     @slow
     def test_regex_idempotency(self):
         url = self.banklist_data
-        dfs = self.run_read_html(path2url(url),
+        dfs = self.run_read_html(path_to_url(url),
                                  match=re.compile(re.compile('Florida')),
                                  attrs={'id': 'table'})
         self.assertIsInstance(dfs, list)
@@ -470,7 +468,7 @@ def test_invalid_flavor():

 def get_elements_from_url(url, element='table', base_url="file://"):
     _skip_if_none_of(('bs4', 'html5lib'))
-    url = path2url(url) if base_url == "file://" else "".join([base_url, url])
+    url = path_to_url(url) if base_url == "file://" else "".join([base_url, url])
     from bs4 import BeautifulSoup
     with urlopen(url) as f:
         soup = BeautifulSoup(f, features='html5lib')

From 3236647c14047a7c71d30e6c2ae912787bd8b6f6 Mon Sep 17 00:00:00 2001
From: Gabi Davar
Date: Fri, 23 Aug 2013 11:44:08 +0300
Subject: [PATCH 3/3] make path_to_url python3 friendly

---
 pandas/io/common.py          | 23 ++++++++++++++++-------
 pandas/io/tests/test_html.py | 18 ++++++++++--------
 2 files changed, 26 insertions(+), 15 deletions(-)

diff --git a/pandas/io/common.py b/pandas/io/common.py
index e447c2a15d802..bcc447a88e04d 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -1,15 +1,25 @@
 """Common IO api utilities"""

 import sys
-import urlparse
 import urllib2
-import urllib
 import zipfile
 from contextlib import contextmanager, closing
-from StringIO import StringIO
+

 from pandas.util import py3compat

+if py3compat.PY3:  # pragma: no cover
+    import urllib.parse as urlparse
+    from urllib.parse import urljoin
+    from urllib.request import pathname2url
+    from io import StringIO
+else:
+    import urlparse
+    from urlparse import urljoin
+    from urllib import pathname2url
+    from StringIO import StringIO
+
+
 _VALID_URLS = set(urlparse.uses_relative + urlparse.uses_netloc +
                   urlparse.uses_params)
 _VALID_URLS.discard('')
@@ -69,8 +79,8 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None):
         else:
             errors = 'replace'
             encoding = 'utf-8'
-        bytes = filepath_or_buffer.read().decode(encoding, errors)
-        filepath_or_buffer = StringIO(bytes)
+        raw_bytes = filepath_or_buffer.read().decode(encoding, errors)
+        filepath_or_buffer = StringIO(raw_bytes)
         return filepath_or_buffer, encoding
     return filepath_or_buffer, None

@@ -104,8 +114,7 @@ def path_to_url(path):
     -------
     a valid FILE URL
     """
-    return urlparse.urljoin(
-        'file:', urllib.pathname2url(path))
+    return urljoin('file:', pathname2url(path))


 # ----------------------
diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py
index 61a7159c9468e..fa905c0154d64 100644
--- a/pandas/io/tests/test_html.py
+++ b/pandas/io/tests/test_html.py
@@ -1,10 +1,16 @@
 import os
 import re
-from cStringIO import StringIO
 from unittest import TestCase
 import warnings
 from distutils.version import LooseVersion

+from pandas.util import py3compat
+
+if py3compat.PY3:
+    from io import StringIO
+else:
+    from cStringIO import StringIO
+
 import nose
 from nose.tools import assert_raises

@@ -27,10 +33,6 @@
 from pandas.util.testing import makeCustomDataframe as mkdf


-def path2url(path):
-    return urlparse.urljoin(
-        'file:', urllib.pathname2url(path))
-
 def _have_module(module_name):
     try:
         import_module(module_name)
@@ -466,9 +468,9 @@ def test_invalid_flavor():
                              flavor='not a* valid**++ flaver')


-def get_elements_from_url(url, element='table', base_url="file://"):
+def get_elements_from_file(url, element='table'):
     _skip_if_none_of(('bs4', 'html5lib'))
-    url = path_to_url(url) if base_url == "file://" else "".join([base_url, url])
+    url = path_to_url(url)
     from bs4 import BeautifulSoup
     with urlopen(url) as f:
         soup = BeautifulSoup(f, features='html5lib')
@@ -480,7 +482,7 @@ def test_bs4_finds_tables():
     filepath = os.path.join(DATA_PATH, "spam.html")
     with warnings.catch_warnings():
         warnings.filterwarnings('ignore')
-        assert get_elements_from_url(filepath, 'table')
+        assert get_elements_from_file(filepath, 'table')


 def get_lxml_elements(url, element):
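
For reference, the helper this series converges on amounts to the standalone
sketch below. It uses the Python 3 names that the last patch imports
(urllib.parse.urljoin and urllib.request.pathname2url); the example paths are
only illustrations, not files used by the tests.

    from urllib.parse import urljoin
    from urllib.request import pathname2url

    def path_to_url(path):
        # Let the stdlib quote the path and normalise separators instead of
        # gluing 'file://' onto a native path by hand.
        return urljoin('file:', pathname2url(path))

    # POSIX:   path_to_url('/tmp/banklist.html')
    #          -> 'file:///tmp/banklist.html'
    # Windows: pathname2url(r'C:\data\banklist.html') gives
    #          '///C:/data/banklist.html', so the join yields
    #          'file:///C:/data/banklist.html'.
    #
    # Naively prefixing the same Windows path with 'file://' keeps the
    # backslashes and the bare drive colon, which urlopen() generally fails
    # to parse -- the problem the first commit message links to.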