diff --git a/codebeaver.yml b/codebeaver.yml
new file mode 100644
index 0000000..a22a7b7
--- /dev/null
+++ b/codebeaver.yml
@@ -0,0 +1,7 @@
+workspaces:
+- from: jest
+  name: scrapegraph-js
+  path: scrapegraph-js
+- from: pytest
+  name: scrapegraph-py
+  path: scrapegraph-py
diff --git a/scrapegraph-py/tests/test_localscraper.py b/scrapegraph-py/tests/test_localscraper.py
new file mode 100644
index 0000000..ce54e5a
--- /dev/null
+++ b/scrapegraph-py/tests/test_localscraper.py
@@ -0,0 +1,90 @@
+import pytest
+from pydantic import BaseModel
+from scrapegraph_py.models.localscraper import LocalScraperRequest, GetLocalScraperRequest
+
+# Create a dummy output schema to test the conversion in model_dump.
+class DummySchema(BaseModel):
+    test_field: str
+
+def test_output_schema_conversion():
+    """
+    Test that when an output_schema is provided in a LocalScraperRequest,
+    model_dump returns a dictionary where the output_schema key holds the JSON schema
+    of the provided Pydantic model.
+    """
+    user_prompt = "Extract company details"
+    website_html = "<html><body><div>Content</div></body></html>"
+    # Create a LocalScraperRequest with a dummy output_schema.
+    request = LocalScraperRequest(user_prompt=user_prompt, website_html=website_html, output_schema=DummySchema)
+    dumped = request.model_dump()
+    # Verify that output_schema is converted properly in the dumped dictionary.
+    assert "output_schema" in dumped
+    assert dumped["output_schema"] == DummySchema.model_json_schema()
+
+def test_invalid_website_html_structure():
+    """
+    Test that LocalScraperRequest raises a ValueError when the website_html provided
+    has no parseable HTML tags. This ensures the HTML content validation catches
+    non-HTML input.
+    """
+    # This string has no HTML tags, so BeautifulSoup.find() should return None.
+    invalid_html = "Just some random text"
+    with pytest.raises(ValueError, match="Invalid HTML - no parseable content found"):
+        LocalScraperRequest(user_prompt="Extract info about the company", website_html=invalid_html)
+
+def test_invalid_user_prompt_non_alnum():
+    """
+    Test that LocalScraperRequest raises a ValueError when the user_prompt
+    does not contain any alphanumeric characters.
+    """
+    with pytest.raises(ValueError, match="User prompt must contain a valid prompt"):
+        LocalScraperRequest(
+            user_prompt="!!!",
+            website_html="<html><body><div>Valid Content</div></body></html>"
+        )
+
+def test_get_localscraper_request_invalid_uuid():
+    """
+    Test that GetLocalScraperRequest raises a ValueError when an invalid UUID is provided.
+    This ensures that the model correctly validates the request_id as a proper UUID.
+    """
+    invalid_uuid = "not-a-valid-uuid"
+    with pytest.raises(ValueError, match="request_id must be a valid UUID"):
+        GetLocalScraperRequest(request_id=invalid_uuid)
+
+def test_website_html_exceeds_maximum_size():
+    """
+    Test that LocalScraperRequest raises a ValueError when the website_html content
+    exceeds the maximum allowed size of 2MB. The generated HTML is valid but too large.
+    """
+    # Calculate the number of characters needed to exceed 2MB when encoded in UTF-8.
+    max_size_bytes = 2 * 1024 * 1024
+    # Create a valid HTML string that exceeds 2MB.
+    base_html_prefix = "<html><body>"
+    base_html_suffix = "</body></html>"
+    repeated_char_length = max_size_bytes - len(base_html_prefix.encode("utf-8")) - len(base_html_suffix.encode("utf-8")) + 1
+    oversized_content = "a" * repeated_char_length
+    oversized_html = f"{base_html_prefix}{oversized_content}{base_html_suffix}"
+
+    with pytest.raises(ValueError, match="Website HTML content exceeds maximum size of 2MB"):
+        LocalScraperRequest(user_prompt="Extract info", website_html=oversized_html)
+
+def test_website_html_exactly_maximum_size():
+    """
+    Test that LocalScraperRequest accepts website_html content exactly 2MB in size.
+    This ensures that the size validation correctly allows content on the boundary.
+    """
+    user_prompt = "Extract info with exact size HTML"
+    prefix = "<html><body>"
+    suffix = "</body></html>"
+    # Calculate the content length needed to reach exactly 2MB when combined with the prefix and suffix.
+    max_size_bytes = 2 * 1024 * 1024
+    content_length = max_size_bytes - len(prefix.encode("utf-8")) - len(suffix.encode("utf-8"))
+    valid_content = "a" * content_length
+    html = prefix + valid_content + suffix
+
+    # Attempt to create a valid LocalScraperRequest.
+    request = LocalScraperRequest(user_prompt=user_prompt, website_html=html)
+
+    # Verify that the HTML content is exactly 2MB in size when encoded in UTF-8.
+    assert len(request.website_html.encode("utf-8")) == max_size_bytes
diff --git a/scrapegraph-py/tests/test_markdownify.py b/scrapegraph-py/tests/test_markdownify.py
new file mode 100644
index 0000000..80c6e32
--- /dev/null
+++ b/scrapegraph-py/tests/test_markdownify.py
@@ -0,0 +1,59 @@
+import pytest
+from scrapegraph_py.models.markdownify import MarkdownifyRequest, GetMarkdownifyRequest
+
+def test_markdownify_request_invalid_url_scheme():
+    """
+    Test that MarkdownifyRequest raises a ValueError when the website_url does not
+    start with either 'http://' or 'https://'.
+    """
+    with pytest.raises(ValueError, match="Invalid URL"):
+        MarkdownifyRequest(website_url="ftp://example.com")
+
+def test_markdownify_request_empty_url():
+    """
+    Test that MarkdownifyRequest raises a ValueError when the website_url is empty or contains only whitespace.
+    """
+    with pytest.raises(ValueError, match="Website URL cannot be empty"):
+        MarkdownifyRequest(website_url=" ")
+
+def test_markdownify_request_valid_url():
+    """
+    Test that MarkdownifyRequest properly creates an instance when provided with a valid URL.
+    This covers the scenario where the input URL meets all validation requirements.
+ """ + valid_url = "https://example.com" + req = MarkdownifyRequest(website_url=valid_url) + assert req.website_url == valid_url + +def test_markdownify_request_untrimmed_url(): + """ + Test that MarkdownifyRequest raises a ValueError when the website_url contains leading or trailing whitespace. + Although the stripped URL would be valid, the actual value is not processed further, causing the check + for the proper URL scheme to fail. + """ + # The URL has leading whitespace, so it does not start directly with "https://" + with pytest.raises(ValueError, match="Invalid URL"): + MarkdownifyRequest(website_url=" https://example.com") + +def test_get_markdownify_request_invalid_uuid(): + """ + Test that GetMarkdownifyRequest raises a ValueError when the request_id is not a valid UUID. + """ + with pytest.raises(ValueError, match="request_id must be a valid UUID"): + GetMarkdownifyRequest(request_id="invalid_uuid") + +def test_get_markdownify_request_valid_uuid(): + """ + Test that GetMarkdownifyRequest properly creates an instance when provided with a valid UUID. + """ + valid_uuid = "123e4567-e89b-12d3-a456-426614174000" + req = GetMarkdownifyRequest(request_id=valid_uuid) + assert req.request_id == valid_uuid + +def test_get_markdownify_request_untrimmed_uuid(): + """ + Test that GetMarkdownifyRequest raises a ValueError when the request_id + contains leading or trailing whitespace, despite the trimmed UUID being valid. + """ + with pytest.raises(ValueError, match="request_id must be a valid UUID"): + GetMarkdownifyRequest(request_id=" 123e4567-e89b-12d3-a456-426614174000 ") diff --git a/scrapegraph-py/tests/test_smartscraper.py b/scrapegraph-py/tests/test_smartscraper.py new file mode 100644 index 0000000..3d8b598 --- /dev/null +++ b/scrapegraph-py/tests/test_smartscraper.py @@ -0,0 +1,78 @@ +import pytest +from pydantic import BaseModel, ValidationError +from scrapegraph_py.models.smartscraper import SmartScraperRequest, GetSmartScraperRequest + +# Define a dummy schema to test the output_schema conversion in model_dump +class DummySchema(BaseModel): + """A dummy schema to simulate a Pydantic model with JSON schema conversion.""" + a: int = 1 + +def test_model_dump_with_output_schema_conversion(): + """ + Test that model_dump on SmartScraperRequest converts the provided output_schema into a JSON schema dict. + """ + # Create a request with a valid user prompt, website URL, and a dummy output_schema. + request = SmartScraperRequest( + user_prompt="Extract information about the company", + website_url="https://scrapegraphai.com/", + output_schema=DummySchema + ) + # Get the dump dict from the model. + output = request.model_dump() + # The model_dump should include the 'output_schema' converted to its JSON schema representation. + expected_schema = DummySchema.model_json_schema() + assert output.get("output_schema") == expected_schema + +def test_model_dump_without_output_schema(): + """ + Test that model_dump on SmartScraperRequest returns output_schema as None + when no output_schema is provided. This ensures that the conversion logic is only + applied when output_schema is not None. + """ + # Create a valid SmartScraperRequest without providing an output_schema. + request = SmartScraperRequest( + user_prompt="Extract some meaningful data", + website_url="https://scrapegraphai.com/" + ) + # Get the dumped dictionary from the model. + output = request.model_dump() + # Ensure that the output contains the key "output_schema" and its value is None. 
+ assert "output_schema" in output, "Output schema key should be present even if None" + assert output["output_schema"] is None, "Output schema should be None when not provided" + +def test_invalid_get_smartscraper_request_id(): + """ + Test that GetSmartScraperRequest raises a ValueError when provided with an invalid UUID. + This test ensures that the request_id field is validated correctly. + """ + with pytest.raises(ValueError, match="request_id must be a valid UUID"): + GetSmartScraperRequest(request_id="invalid-uuid") + +def test_invalid_url_in_smartscraper_request(): + """ + Test that SmartScraperRequest raises a ValueError when provided with a website_url + that does not start with 'http://' or 'https://'. This ensures the URL validation works. + """ + with pytest.raises(ValueError, match="Invalid URL"): + SmartScraperRequest( + user_prompt="Extract data", + website_url="ftp://invalid-url" + ) + +def test_invalid_user_prompt_empty_and_non_alnum(): + """ + Test that SmartScraperRequest raises a ValueError when the user_prompt is either empty (or only whitespace) + or when it contains no alphanumeric characters. This ensures the user prompt validator is working correctly. + """ + # Test with a user_prompt that is empty (only whitespace) + with pytest.raises(ValueError, match="User prompt cannot be empty"): + SmartScraperRequest( + user_prompt=" ", + website_url="https://scrapegraphai.com/" + ) + # Test with a user_prompt that contains no alphanumeric characters + with pytest.raises(ValueError, match="User prompt must contain a valid prompt"): + SmartScraperRequest( + user_prompt="!!!", + website_url="https://scrapegraphai.com/" + )