diff --git a/README.md b/README.md index 7b94c9a..232ab2c 100644 --- a/README.md +++ b/README.md @@ -92,12 +92,37 @@ Represents a page in the document: This node represent a paragraph, a heading or any text within the document. -- `category`: The type `"doc"`. +- `category`: The classification of the text within the document. - `content`: A string representing the textual content. - `marks`: List of [marks](#marks) applied to the text, such as bold, italic, etc. - `attributes`: Can contain metadata like the bounding box representing where this portion of text is located in the page. - +### Category +Below are the various categories of text that may be found within a document: + +**Category Type** +- `page-header`: Represents the header of the page. +- `footer`: Represents the footer of the page. +- `heading`: Any heading within the document. +- `figure`: Represents a figure or an image. +- `other`: Any other unclassified text. +- `appendix`: Text within an appendix. +- `keywords`: List of keywords. +- `acknowledgments`: Section acknowledging contributors. +- `caption`: Caption associated with a figure or table. +- `toc`: Table of contents. +- `abstract`: The abstract of the document. +- `footnote`: Text at the bottom of the page providing additional information. +- `body`: Main body text of the document. +- `itemize-item`: Item in a list or bullet point. +- `title`: The title of the document. +- `reference`: References or citations within the document. +- `affiliation`: Author's institutional affiliation. +- `general-terms`: General terms section. +- `formula`: Mathematical formula or equation. +- `categories`: Categories or topics listed in the document. +- `table`: Represents a table. +- `authors`: List of authors. ### Marks @@ -119,8 +144,9 @@ Attributes are optional fields that can store additional information for each no - `DocumentAttributes`: General attributes for the document (currently reserved for the future). 
- `PageAttributes`: Specific page related attributes, such as the page number. -- `TextAttributes`: Text related attributes, such as bounding boxes. +- `TextAttributes`: Text-related attributes, such as bounding boxes or the text's hierarchy level. - `BoundingBox`: A box that specifies the position of a text in the page. +- `Level`: The hierarchical level of the text within the document (an integer from 1 to 4), used, for example, for headings. ## Getting started diff --git a/parse_document_model/attributes.py b/parse_document_model/attributes.py index 79244b7..b04128d 100644 --- a/parse_document_model/attributes.py +++ b/parse_document_model/attributes.py @@ -1,6 +1,7 @@ from abc import ABC +from typing import Optional -from pydantic import BaseModel +from pydantic import BaseModel, Field class BoundingBox(BaseModel): @@ -25,3 +26,4 @@ class PageAttributes(Attributes): class TextAttributes(Attributes): bounding_box: list[BoundingBox] = [] + level: Optional[int] = Field(None, ge=1, le=4) diff --git a/test/test_validation.py b/test/test_validation.py index 339764f..e564f9d 100644 --- a/test/test_validation.py +++ b/test/test_validation.py @@ -66,3 +66,36 @@ def test_url_marks(): else: with pytest.raises(ValueError): UrlMark(**mark_json) + + +def test_text_attributes_level(): + valid_text_attributes = [ + {"bounding_box": [], "level": 1}, + {"bounding_box": [], "level": 2}, + {"bounding_box": [], "level": 3}, + {"bounding_box": [], "level": 4}, + {"bounding_box": [], "level": None}, + {"bounding_box": []}, + {} + ] + + for attributes_json in valid_text_attributes: + text_attributes = TextAttributes(**attributes_json) + assert isinstance(text_attributes, TextAttributes) + assert isinstance(text_attributes.level, (int, type(None))) + if text_attributes.level is not None: + assert text_attributes.level in range(1, 5) + assert attributes_json["level"] == text_attributes.level + else: + assert "level" not in attributes_json or attributes_json["level"] is None + + invalid_text_attributes = [ + {"bounding_box": [], "level": -1}, +
{"bounding_box": [], "level": "invalid"}, + {"bounding_box": [], "level": 2.5}, + {"bounding_box": [], "level": 5}, + ] + + for attributes_json in invalid_text_attributes: + with pytest.raises(ValueError): + TextAttributes(**attributes_json)