From a7e8e307354cc9a58e355bd8ec4bda3245017906 Mon Sep 17 00:00:00 2001 From: AnnaMarika01 Date: Tue, 24 Sep 2024 12:33:51 +0200 Subject: [PATCH 1/5] Added 'level' attribute to the model and created tests to verify its validity --- parse_document_model/attributes.py | 11 +++++++++- test/test_validation.py | 34 ++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/parse_document_model/attributes.py b/parse_document_model/attributes.py index 79244b7..d0a26a2 100644 --- a/parse_document_model/attributes.py +++ b/parse_document_model/attributes.py @@ -1,6 +1,7 @@ from abc import ABC +from typing import Optional -from pydantic import BaseModel +from pydantic import BaseModel, Field, validator, field_validator class BoundingBox(BaseModel): @@ -25,3 +26,11 @@ class PageAttributes(Attributes): class TextAttributes(Attributes): bounding_box: list[BoundingBox] = [] + level: Optional[int] = Field(None, gw=1, le=4) + + @field_validator('level') + @classmethod + def check_level(cls, v) -> int: + if v is not None and v not in range(1, 5): + raise ValueError("Level must be between 1 and 4 or None") + return v \ No newline at end of file diff --git a/test/test_validation.py b/test/test_validation.py index 339764f..f05ebb9 100644 --- a/test/test_validation.py +++ b/test/test_validation.py @@ -1,6 +1,7 @@ import json import pytest +import pydantic from parse_document_model import Document, Page from parse_document_model.attributes import PageAttributes, TextAttributes @@ -66,3 +67,36 @@ def test_url_marks(): else: with pytest.raises(ValueError): UrlMark(**mark_json) + + +def test_text_attributes_level(): + valid_text_attributes = [ + {"bounding_box": [], "level": 1}, + {"bounding_box": [], "level": 2}, + {"bounding_box": [], "level": 3}, + {"bounding_box": [], "level": 4}, + {"bounding_box": [], "level": None}, + {"bounding_box": []}, + {} + ] + + for attributes_json in valid_text_attributes: + text_attributes = TextAttributes(**attributes_json) 
+ assert isinstance(text_attributes, TextAttributes) + assert isinstance(text_attributes.level, (int, type(None))) + if text_attributes.level is not None: + assert text_attributes.level in range(1, 5) + assert attributes_json["level"] == text_attributes.level + else: + assert "level" not in attributes_json or attributes_json["level"] is None + + invalid_text_attributes = [ + {"bounding_box": [], "level": -1}, + {"bounding_box": [], "level": "invalid"}, + {"bounding_box": [], "level": 2.5}, + {"bounding_box": [], "level": 5}, + ] + + for attributes_json in invalid_text_attributes: + with pytest.raises(ValueError): + TextAttributes(**attributes_json) From 5d6d8910100a48479024362609abf4d376e3fd8b Mon Sep 17 00:00:00 2001 From: AnnaMarika01 Date: Tue, 24 Sep 2024 13:37:31 +0200 Subject: [PATCH 2/5] fixed import --- parse_document_model/attributes.py | 2 +- test/test_validation.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/parse_document_model/attributes.py b/parse_document_model/attributes.py index d0a26a2..3b91de1 100644 --- a/parse_document_model/attributes.py +++ b/parse_document_model/attributes.py @@ -1,7 +1,7 @@ from abc import ABC from typing import Optional -from pydantic import BaseModel, Field, validator, field_validator +from pydantic import BaseModel, Field, field_validator class BoundingBox(BaseModel): diff --git a/test/test_validation.py b/test/test_validation.py index f05ebb9..e564f9d 100644 --- a/test/test_validation.py +++ b/test/test_validation.py @@ -1,7 +1,6 @@ import json import pytest -import pydantic from parse_document_model import Document, Page from parse_document_model.attributes import PageAttributes, TextAttributes From 1084722dbe337cf0746e7b15a11d6722e62416ea Mon Sep 17 00:00:00 2001 From: Andrea Ponti Date: Tue, 24 Sep 2024 14:49:12 +0200 Subject: [PATCH 3/5] Add new line --- parse_document_model/attributes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parse_document_model/attributes.py 
b/parse_document_model/attributes.py index 3b91de1..3fbfa16 100644 --- a/parse_document_model/attributes.py +++ b/parse_document_model/attributes.py @@ -33,4 +33,4 @@ class TextAttributes(Attributes): def check_level(cls, v) -> int: if v is not None and v not in range(1, 5): raise ValueError("Level must be between 1 and 4 or None") - return v \ No newline at end of file + return v From 6e5293fb9ea67d5cc018b8b72f78c60eab871c22 Mon Sep 17 00:00:00 2001 From: AnnaMarika01 Date: Tue, 24 Sep 2024 15:11:12 +0200 Subject: [PATCH 4/5] Fix a typo and update README.md --- README.md | 32 +++++++++++++++++++++++++++--- parse_document_model/attributes.py | 9 +-------- 2 files changed, 30 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 7b94c9a..232ab2c 100644 --- a/README.md +++ b/README.md @@ -92,12 +92,37 @@ Represents a page in the document: This node represent a paragraph, a heading or any text within the document. -- `category`: The type `"doc"`. +- `category`: The classification of the text within the document. - `content`: A string representing the textual content. - `marks`: List of [marks](#marks) applied to the text, such as bold, italic, etc. - `attributes`: Can contain metadata like the bounding box representing where this portion of text is located in the page. - +### Category +Below are the various categories of text that may be found within a document: + +**Category Type** +- `page-header`: Represents the header of the page. +- `footer`: Represents the footer of the page. +- `heading`: Any heading within the document. +- `figure`: Represents a figure or an image. +- `other`: Any other unclassified text. +- `appendix`: Text within an appendix. +- `keywords`: List of keywords. +- `acknowledgments`: Section acknowledging contributors. +- `caption`: Caption associated with a figure or table. +- `toc`: Table of contents. +- `abstract`: The abstract of the document. +- `footnote`: Text at the bottom of the page providing additional information. 
+- `body`: Main body text of the document. +- `itemize-item`: Item in a list or bullet point. +- `title`: The title of the document. +- `reference`: References or citations within the document. +- `affiliation`: Author's institutional affiliation. +- `general-terms`: General terms section. +- `formula`: Mathematical formula or equation. +- `categories`: Categories or topics listed in the document. +- `table`: Represents a table. +- `authors`: List of authors. ### Marks @@ -119,8 +144,9 @@ Attributes are optional fields that can store additional information for each no - `DocumentAttributes`: General attributes for the document (currently reserved for the future). - `PageAttributes`: Specific page related attributes, such as the page number. -- `TextAttributes`: Text related attributes, such as bounding boxes. +- `TextAttributes`: Text-related attributes, such as bounding boxes or the text level. - `BoundingBox`: A box that specifies the position of a text in the page. +- `Level`: The level of the text within the document, for example the depth of a heading; an optional integer from 1 to 4. 
## Getting started diff --git a/parse_document_model/attributes.py b/parse_document_model/attributes.py index 3b91de1..7724bb1 100644 --- a/parse_document_model/attributes.py +++ b/parse_document_model/attributes.py @@ -26,11 +26,4 @@ class PageAttributes(Attributes): class TextAttributes(Attributes): bounding_box: list[BoundingBox] = [] - level: Optional[int] = Field(None, gw=1, le=4) - - @field_validator('level') - @classmethod - def check_level(cls, v) -> int: - if v is not None and v not in range(1, 5): - raise ValueError("Level must be between 1 and 4 or None") - return v \ No newline at end of file + level: Optional[int] = Field(None, ge=1, le=4) From 33094a4e2862b1bce4239e215e48612942080c59 Mon Sep 17 00:00:00 2001 From: Andrea Ponti Date: Tue, 24 Sep 2024 15:27:06 +0200 Subject: [PATCH 5/5] Remove unused import --- parse_document_model/attributes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parse_document_model/attributes.py b/parse_document_model/attributes.py index 7724bb1..b04128d 100644 --- a/parse_document_model/attributes.py +++ b/parse_document_model/attributes.py @@ -1,7 +1,7 @@ from abc import ABC from typing import Optional -from pydantic import BaseModel, Field, field_validator +from pydantic import BaseModel, Field class BoundingBox(BaseModel):