From d0bdcea470601567ff031d451fe9da6706de4230 Mon Sep 17 00:00:00 2001 From: Andrea Ponti Date: Mon, 16 Sep 2024 09:46:12 +0200 Subject: [PATCH 01/14] Add base models --- README.md | 2 +- .../__init__.py | 0 document_model_python/document.py | 34 ++++++++++++++ document_model_python/marks.py | 47 +++++++++++++++++++ requirements.txt | 1 + 5 files changed, 83 insertions(+), 1 deletion(-) rename {document-model-python => document_model_python}/__init__.py (100%) create mode 100644 document_model_python/document.py create mode 100644 document_model_python/marks.py create mode 100644 requirements.txt diff --git a/README.md b/README.md index b363d42..88f9ad6 100644 --- a/README.md +++ b/README.md @@ -1 +1 @@ -# :card_box: Document Model Python +# :card_file_box: Document Model Python diff --git a/document-model-python/__init__.py b/document_model_python/__init__.py similarity index 100% rename from document-model-python/__init__.py rename to document_model_python/__init__.py diff --git a/document_model_python/document.py b/document_model_python/document.py new file mode 100644 index 0000000..ae78342 --- /dev/null +++ b/document_model_python/document.py @@ -0,0 +1,34 @@ +from typing import List, Union, TypedDict + +from pydantic import BaseModel, Field + +from document_model_python.marks import Mark, TextStyleMark + + +class BoundingBox(TypedDict): + min_x: float + min_y: float + max_x: float + max_y: float + page: int + + +class ContentAttributes(BaseModel): + bounding_box: List[BoundingBox] = [] + + +class Content(BaseModel): + role: str + text: str + marks: List[Union[Mark, TextStyleMark]] = [] + attributes: ContentAttributes = ContentAttributes() + + +class NodeAttributes(BaseModel): + page: int + + +class Node(BaseModel): + category: str + attributes: NodeAttributes + content: List[Content] diff --git a/document_model_python/marks.py b/document_model_python/marks.py new file mode 100644 index 0000000..1704101 --- /dev/null +++ b/document_model_python/marks.py @@ 
-0,0 +1,47 @@ +from typing import Any +from typing import Literal, Optional + +from pydantic import BaseModel, model_validator + + +class Color(BaseModel): + id: str + r: int + g: int + b: int + + +class Font(BaseModel): + id: str + name: str + size: int + + +class Mark(BaseModel): + category: Literal['bold', 'italic', 'textStyle', 'link'] + + @model_validator(mode='before') + def check_details(self: Any) -> Any: + mark_type = self.get('type') + + if mark_type == 'textStyle': + if 'color' not in self and 'font' not in self: + raise ValueError('color or font must be provided when type is textStyle') + if 'url' in self: + raise ValueError('url should not be provided when type is textStyle') + + elif mark_type == 'link': + if 'url' not in self: + raise ValueError('url must be provided when type is link') + if 'textStyle' in self: + raise ValueError('textStyle should not be provided when type is link') + return self + + +class TextStyleMark(Mark): + color: Optional[Color] = None + font: Optional[Font] = None + + +class UrlMark(Mark): + url: str diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..f22a600 --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +pydantic~=2.9.1 From 6e4a42cca662c54327a086902de21908194123ba Mon Sep 17 00:00:00 2001 From: Andrea Ponti Date: Mon, 16 Sep 2024 10:30:06 +0200 Subject: [PATCH 02/14] Improve models structure --- document_model_python/attributes.py | 23 +++++++++++ document_model_python/document.py | 62 +++++++++++++++++++---------- 2 files changed, 64 insertions(+), 21 deletions(-) create mode 100644 document_model_python/attributes.py diff --git a/document_model_python/attributes.py b/document_model_python/attributes.py new file mode 100644 index 0000000..44c39ad --- /dev/null +++ b/document_model_python/attributes.py @@ -0,0 +1,23 @@ +from abc import ABC + +from pydantic import BaseModel + + +class BoundingBox(BaseModel): + min_x: float + min_y: float + max_x: float + max_y: float + page: int + + 
+class Attributes(BaseModel, ABC): + pass + + +class PageAttributes(Attributes): + page: int + + +class LeafAttributes(Attributes): + bounding_box: list[BoundingBox] = [] diff --git a/document_model_python/document.py b/document_model_python/document.py index ae78342..e0ce380 100644 --- a/document_model_python/document.py +++ b/document_model_python/document.py @@ -1,34 +1,54 @@ -from typing import List, Union, TypedDict +from abc import ABC +from typing import List, Any, Optional from pydantic import BaseModel, Field -from document_model_python.marks import Mark, TextStyleMark +from document_model_python.attributes import Attributes, PageAttributes, LeafAttributes +from document_model_python.marks import Mark -class BoundingBox(TypedDict): - min_x: float - min_y: float - max_x: float - max_y: float - page: int +class Node(BaseModel, ABC): + """Base element of a document. + A document is a hierarchy of nodes. + Nodes could represent: document, pages, headings, etc. + """ + category: str = Field( + title="Node Type", + description="The type of node. Examples are: `doc`, `page`, `heading`, `body`, etc. For an exhaustive list " + "refers to the documentation.", + ) + attributes: Optional[Attributes] = Field( + title="Node Attributes", + description="Attributes related to the node. An example is the reference page." + ) + content: Any = Field( + title="Node Content", + description="The content of the node. If it is a leaf node this is text, otherwise it could be a list of " + "nodes.", + ) -class ContentAttributes(BaseModel): - bounding_box: List[BoundingBox] = [] +class Leaf(Node): + """The leaf node of a document. -class Content(BaseModel): - role: str - text: str - marks: List[Union[Mark, TextStyleMark]] = [] - attributes: ContentAttributes = ContentAttributes() + That's where the actual text is. 
+ """ + attributes: LeafAttributes = LeafAttributes() + content: str + marks: list[Mark] = [] -class NodeAttributes(BaseModel): - page: int +class Page(Node): + """The node that represents a document's page.""" + category: str = "page" + attributes: PageAttributes + content = list[Leaf] -class Node(BaseModel): - category: str - attributes: NodeAttributes - content: List[Content] + +class Document(Node): + """The root node of a document.""" + category: str = "doc" + attributes: Optional[Attributes] = None + content: list[Page] From 7ee4e1823408acb160d513c53506547dc5e96798 Mon Sep 17 00:00:00 2001 From: Andrea Ponti Date: Mon, 16 Sep 2024 11:18:52 +0200 Subject: [PATCH 03/14] Add setup.py and requirements-dev.txt --- requirements-dev.txt | 3 +++ setup.py | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) create mode 100644 requirements-dev.txt create mode 100644 setup.py diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..c12813c --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,3 @@ +setuptools~=75.0.0 +twine~=5.1.1 +wheel~=0.44.0 diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..124c053 --- /dev/null +++ b/setup.py @@ -0,0 +1,33 @@ +from codecs import open +from os import path + +from setuptools import setup + +ROOT = path.abspath(path.dirname(__file__)) + +with open(path.join(ROOT, 'README.md'), encoding='utf-8') as f: + long_description = f.read() + +setup( + name='document-model-python', + version='0.1.0', + description='Define the pydantic models for a text document.', + long_description=long_description, + long_description_content_type='text/markdown', + author='OneOffTech', + author_email='info@oneofftech.de', + license='MIT', + url='https://github.com/OneOffTech/document-model-python', + project_urls={ + 'Source': 'https://github.com/OneOffTech/document-model-python', + }, + classifiers=[ + 'Intended Audience :: Developers', + 'License :: OSI Approved :: MIT License', + 
'Programming Language :: Python', + 'Operating System :: OS Independent' + ], + packages=['document_model_python'], + include_package_data=True, + install_requires=['pydantic'] +) From 05e97612b3f30cb196fbdc7e0e87a90eef7e8b9f Mon Sep 17 00:00:00 2001 From: Andrea Ponti Date: Mon, 16 Sep 2024 12:42:58 +0200 Subject: [PATCH 04/14] Add retro-compatibility with text field with deprecation warning --- document_model_python/attributes.py | 6 +++- document_model_python/document.py | 45 ++++++++++++++++++++++------- 2 files changed, 40 insertions(+), 11 deletions(-) diff --git a/document_model_python/attributes.py b/document_model_python/attributes.py index 44c39ad..79244b7 100644 --- a/document_model_python/attributes.py +++ b/document_model_python/attributes.py @@ -15,9 +15,13 @@ class Attributes(BaseModel, ABC): pass +class DocumentAttributes(Attributes): + pass + + class PageAttributes(Attributes): page: int -class LeafAttributes(Attributes): +class TextAttributes(Attributes): bounding_box: list[BoundingBox] = [] diff --git a/document_model_python/document.py b/document_model_python/document.py index e0ce380..15b9bb0 100644 --- a/document_model_python/document.py +++ b/document_model_python/document.py @@ -1,9 +1,10 @@ +import warnings from abc import ABC -from typing import List, Any, Optional +from typing import List, Optional -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, model_validator -from document_model_python.attributes import Attributes, PageAttributes, LeafAttributes +from document_model_python.attributes import Attributes, PageAttributes, TextAttributes, DocumentAttributes from document_model_python.marks import Mark @@ -14,41 +15,65 @@ class Node(BaseModel, ABC): Nodes could represent: document, pages, headings, etc. """ category: str = Field( + ..., title="Node Type", description="The type of node. Examples are: `doc`, `page`, `heading`, `body`, etc. 
For an exhaustive list " "refers to the documentation.", ) attributes: Optional[Attributes] = Field( + default=None, title="Node Attributes", description="Attributes related to the node. An example is the reference page." ) - content: Any = Field( + + +class StructuredNode(Node): + content: List[Node] = Field( + ..., title="Node Content", description="The content of the node. If it is a leaf node this is text, otherwise it could be a list of " "nodes.", ) -class Leaf(Node): +class Text(Node): """The leaf node of a document. That's where the actual text is. """ - attributes: LeafAttributes = LeafAttributes() - content: str + attributes: Optional[TextAttributes] = TextAttributes() + content: str = Field( + ..., + title="Content", + description="The new field to hold the text content." + ) + text: Optional[str] = Field( + None, + title="Text", + description="(Deprecated) This field is deprecated and will be removed in a future version. " + "Use `content` instead." + ) marks: list[Mark] = [] + @model_validator(mode="before") + def handle_deprecated_text(self): + if "text" in self and "content" not in self: + warnings.warn("The use of `text` is deprecated and will be removed in a future version. 
" + "Use `content` instead.", DeprecationWarning) + self["content"] = self["text"] + return self + class Page(Node): """The node that represents a document's page.""" category: str = "page" - attributes: PageAttributes - content = list[Leaf] + attributes: Optional[PageAttributes] = None + content = list[Text] class Document(Node): """The root node of a document.""" category: str = "doc" - attributes: Optional[Attributes] = None + attributes: Optional[DocumentAttributes] = None content: list[Page] From c1c5578d5d1bebc3baf9c1cc2878307623adf746 Mon Sep 17 00:00:00 2001 From: Andrea Ponti Date: Mon, 16 Sep 2024 12:43:10 +0200 Subject: [PATCH 05/14] Update LICENSE --- LICENSE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index ff1f271..5d0b89e 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2024 Andrea Ponti +Copyright (c) 2024 OneOffTech Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal From 46787598b03157f4a9419fa635994a1730de5793 Mon Sep 17 00:00:00 2001 From: Andrea Ponti Date: Mon, 16 Sep 2024 12:45:19 +0200 Subject: [PATCH 06/14] Update requirements in setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 124c053..2ac72eb 100644 --- a/setup.py +++ b/setup.py @@ -29,5 +29,5 @@ ], packages=['document_model_python'], include_package_data=True, - install_requires=['pydantic'] + install_requires=['pydantic>=2.9.0'] ) From c65ba59b4c9edc300c21a207f71201265c1f9ccd Mon Sep 17 00:00:00 2001 From: Andrea Ponti Date: Mon, 16 Sep 2024 13:07:23 +0200 Subject: [PATCH 07/14] Update README.md --- README.md | 124 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 124 insertions(+) diff --git a/README.md b/README.md index 88f9ad6..7408cf6 100644 --- a/README.md +++ b/README.md @@ -1 +1,125 @@ 
+![pypi](https://img.shields.io/pypi/v/document-model-python.svg) +[![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://docs.pydantic.dev/latest/contributing/#badges) +[![License](https://img.shields.io/badge/license-MIT-green.svg)](LICENSE) + # :card_file_box: Document Model Python + +**Document Model Python** is a library for representing text documents using a hierarchical model. +This library allows you to define documents as a collection of nodes where each node can represent a document, page, +text, heading, body, and more. + +## 🌟 Key Features + +- **Hierarchical Structure**: The document is modeled as a hierarchy of nodes. Each node could represent a part of the +document such as a document itself, pages, text. +- **Rich Text Support**: Nodes can represent not only the content but also the marks (e.g., bold, italic) applied to +the text. +- **Attributes**: Every node can have attributes that provide additional information such as page number, +bounding box, etc. +- **Easy-to-use**: Built with [`Pydantic`](https://docs.pydantic.dev/latest/), ensures type validation and effortless +creation of complex document structures. + +## 📚 Structure Overview + +### 1. **Node** (Base Class) + +The base element of the document is a `Node`. This is the abstract class from which all other nodes inherit. +Each node has: +- `category`: The type of the node (e.g., `doc`, `page`, `heading`). +- `attributes`: Optional field to attach extra data to a node. + +### 2. **StructuredNode** + +This extends the `Node` class and can contain other nodes as content. It is used for non-leaf nodes like +`Document` and `Page`. + +### 3. **Text** + +This is a leaf node and contains the actual text content: + +- `content`: The main text content. +- `text`: Deprecated field, use `content` instead. +- `marks`: List of text marks like bold, italic, text style, etc. + +### 4. 
**Page** + +Represents a page in the document: + +- `category`: Always set to `"page"`. +- `attributes`: Can contain metadata like page number. +- `content`: List of `Text` nodes on the page. + +### 5. **Document** + +This is the root node of the document: + +- `category`: Always set to `"doc"`. +- `attributes`: Document-wide attributes can be set here. +- `content`: List of `Page` nodes that form the document. + +## 🖋️ Marks + +Marks are used to style or add functionality to the text inside a `Text` node. +For example, bold text, italic text, links, and custom styles like font or color. + +### **Mark Types** + +- **Bold**: Represents bold text. +- **Italic**: Represents italic text. +- **TextStyle**: Allows customization of font and color. +- **Link**: Represents a hyperlink. + +Marks are validated and enforced with the help of `Pydantic` model validators. + +## 🧩 Attributes + +Attributes are optional fields that can store extra information for each node. Some predefined attributes include: + +- `BoundingBox`: A box that specifies the position of a text in the page. +- `DocumentAttributes`: General attributes for the document. +- `PageAttributes`: Specific attributes like page number for the page. +- `TextAttributes`: Attributes such as bounding boxes for the text. + +## 🏗️ Installation + +The library `document-model-python` is distributed with PyPI, and you can easily install it with `pip`: + +```bash +pip install document-model-python +``` + +## 🚀 Quick Example + +Here’s how you can represent a simple document with one page and some text: + +```python +from document_model_python.document import Document, Page, Text + +doc = Document( + category="doc", + content=[ + Page( + category="page", + content=[ + Text( + category="heading", + content="Welcome to document-model-python", + marks=["bold"] + ), + Text( + category="body", + content="This is an example text using the document model." 
+ ) + ] + ) + ] +) +``` + +## 💡 Contributing + +Feel free to submit issues or contribute to the development of this library. We appreciate your feedback! + +## 📜 License + +This project is licensed under the MIT License. From 30e5cfeacf3401608036805bafb249aca245d838 Mon Sep 17 00:00:00 2001 From: Andrea Ponti Date: Mon, 16 Sep 2024 14:22:48 +0200 Subject: [PATCH 08/14] Add retro-compatibility with role and deprecation warning --- document_model_python/document.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/document_model_python/document.py b/document_model_python/document.py index 15b9bb0..3e7d281 100644 --- a/document_model_python/document.py +++ b/document_model_python/document.py @@ -48,20 +48,30 @@ class Text(Node): title="Content", description="The new field to hold the text content." ) + marks: list[Mark] = [] text: Optional[str] = Field( None, title="Text", description="(Deprecated) This field is deprecated and will be removed in a future version. " "Use `content` instead." ) - marks: list[Mark] = [] + role: Optional[str] = Field( + None, + title="Node Type", + description="(Deprecated) This field is deprecated and will be removed in a future version. " + "Use `category` instead." + ) @model_validator(mode="before") - def handle_deprecated_text(self): + def handle_deprecations(self): if "text" in self and "content" not in self: warnings.warn("The use of `text` is deprecated and will be removed in a future version. " "Use `content` instead.", DeprecationWarning) self["content"] = self["text"] + if "role" in self and "category" not in self: + warnings.warn("The use of `role` is deprecated and will be removed in a future version. 
" + "Use `category` instead.", DeprecationWarning) + self["category"] = self["role"] return self From d719d13a3ba2ebd56d51dcd721d2b3e56bccb322 Mon Sep 17 00:00:00 2001 From: Andrea Ponti Date: Mon, 16 Sep 2024 14:24:11 +0200 Subject: [PATCH 09/14] Update README.md --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 7408cf6..e8c9a8c 100644 --- a/README.md +++ b/README.md @@ -38,8 +38,9 @@ This extends the `Node` class and can contain other nodes as content. It is used This is a leaf node and contains the actual text content: - `content`: The main text content. -- `text`: Deprecated field, use `content` instead. - `marks`: List of text marks like bold, italic, text style, etc. +- `text`: Deprecated field, use `content` instead. +- `role`: Deprecated field, use `category` instead. ### 4. **Page** From e8f28039c3e884858f0c01e04d23375b456611a1 Mon Sep 17 00:00:00 2001 From: Andrea Ponti Date: Tue, 17 Sep 2024 10:12:12 +0200 Subject: [PATCH 10/14] Rename main module --- document_model_python/__init__.py | 0 parse_document_model/__init__.py | 1 + .../attributes.py | 0 .../document.py | 10 +++++----- .../marks.py | 2 +- setup.py | 2 +- 6 files changed, 8 insertions(+), 7 deletions(-) delete mode 100644 document_model_python/__init__.py create mode 100644 parse_document_model/__init__.py rename {document_model_python => parse_document_model}/attributes.py (100%) rename {document_model_python => parse_document_model}/document.py (89%) rename {document_model_python => parse_document_model}/marks.py (96%) diff --git a/document_model_python/__init__.py b/document_model_python/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/parse_document_model/__init__.py b/parse_document_model/__init__.py new file mode 100644 index 0000000..095cb1f --- /dev/null +++ b/parse_document_model/__init__.py @@ -0,0 +1 @@ +from .document import Document, Page diff --git a/document_model_python/attributes.py 
b/parse_document_model/attributes.py similarity index 100% rename from document_model_python/attributes.py rename to parse_document_model/attributes.py diff --git a/document_model_python/document.py b/parse_document_model/document.py similarity index 89% rename from document_model_python/document.py rename to parse_document_model/document.py index 3e7d281..ef60e77 100644 --- a/document_model_python/document.py +++ b/parse_document_model/document.py @@ -1,11 +1,11 @@ import warnings from abc import ABC -from typing import List, Optional +from typing import List, Optional, Union from pydantic import BaseModel, Field, model_validator -from document_model_python.attributes import Attributes, PageAttributes, TextAttributes, DocumentAttributes -from document_model_python.marks import Mark +from parse_document_model.attributes import Attributes, PageAttributes, TextAttributes, DocumentAttributes +from parse_document_model.marks import Mark, TextStyleMark, UrlMark class Node(BaseModel, ABC): @@ -48,7 +48,7 @@ class Text(Node): title="Content", description="The new field to hold the text content." 
) - marks: list[Mark] = [] + marks: list[Union[Mark, TextStyleMark, UrlMark]] = [] text: Optional[str] = Field( None, title="Text", @@ -79,7 +79,7 @@ class Page(Node): """The node that represents a document's page.""" category: str = "page" attributes: Optional[PageAttributes] = None - content = list[Text] + content: list[Text] class Document(Node): diff --git a/document_model_python/marks.py b/parse_document_model/marks.py similarity index 96% rename from document_model_python/marks.py rename to parse_document_model/marks.py index 1704101..1662c56 100644 --- a/document_model_python/marks.py +++ b/parse_document_model/marks.py @@ -22,7 +22,7 @@ class Mark(BaseModel): @model_validator(mode='before') def check_details(self: Any) -> Any: - mark_type = self.get('type') + mark_type = self.get('category') if mark_type == 'textStyle': if 'color' not in self and 'font' not in self: diff --git a/setup.py b/setup.py index 2ac72eb..68b34b3 100644 --- a/setup.py +++ b/setup.py @@ -27,7 +27,7 @@ 'Programming Language :: Python', 'Operating System :: OS Independent' ], - packages=['document_model_python'], + packages=['parse_document_model'], include_package_data=True, install_requires=['pydantic>=2.9.0'] ) From 57e13c038a72c24444eb023a6509d8a19e425bde Mon Sep 17 00:00:00 2001 From: Andrea Ponti Date: Tue, 17 Sep 2024 10:12:27 +0200 Subject: [PATCH 11/14] Add unit test with pytest --- requirements-dev.txt | 1 + test/__init__.py | 0 test/data/extract-text-1.json | 248 ++++++++++++++++++++++++++++++ test/data/extract-text-2.json | 37 +++++ test/data/extract-text-empty.json | 12 ++ test/test_validation.py | 68 ++++++++ 6 files changed, 366 insertions(+) create mode 100644 test/__init__.py create mode 100644 test/data/extract-text-1.json create mode 100644 test/data/extract-text-2.json create mode 100644 test/data/extract-text-empty.json create mode 100644 test/test_validation.py diff --git a/requirements-dev.txt b/requirements-dev.txt index c12813c..7a59972 100644 --- 
a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,3 +1,4 @@ +pytest~=8.3.3 setuptools~=75.0.0 twine~=5.1.1 wheel~=0.44.0 diff --git a/test/__init__.py b/test/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/test/data/extract-text-1.json b/test/data/extract-text-1.json new file mode 100644 index 0000000..2d83f2f --- /dev/null +++ b/test/data/extract-text-1.json @@ -0,0 +1,248 @@ +{ + "category": "doc", + "content": [ + { + "category": "page", + "attributes": { + "page": 1 + }, + "content": [ + { + "role": "page-header", + "text": "Type of document / Offer / Contract / Report", + "marks": [ + { + "category": "textStyle", + "color": { + "r": 78, + "b": 189, + "g": 128, + "id": "color-1" + }, + "font": { + "name": "fira sans", + "id": "font-300", + "size": 18 + } + } + ], + "attributes": { + "bounding_box": [ + { + "min_x": 62.1, + "min_y": 565.0, + "max_x": 427.2, + "max_y": 577.6, + "page": 1 + } + ] + } + }, + { + "role": "title", + "text": "This is the title of the document, it", + "marks": [ + { + "category": "textStyle", + "color": { + "r": 0, + "b": 0, + "g": 0, + "id": "color-0" + }, + "font": { + "name": "fira sans", + "id": "font-300", + "size": 30 + } + } + ], + "attributes": { + "bounding_box": [ + { + "min_x": 62.1, + "min_y": 532.1, + "max_x": 514.6, + "max_y": 554.7, + "page": 1 + } + ] + } + }, + { + "role": "heading", + "text": "can use multiple lines and grow a bit", + "marks": [ + { + "category": "textStyle", + "color": { + "r": 0, + "b": 0, + "g": 0, + "id": "color-0" + }, + "font": { + "name": "fira sans", + "id": "font-300", + "size": 30 + } + } + ], + "attributes": { + "bounding_box": [ + { + "min_x": 62.1, + "min_y": 496.1, + "max_x": 503.0, + "max_y": 518.7, + "page": 1 + }, + { + "min_x": 62.1, + "min_y": 460.1, + "max_x": 98.7, + "max_y": 482.7, + "page": 1 + } + ] + } + }, + { + "role": "heading", + "text": "Subtitle of the document", + "marks": [ + { + "category": "textStyle", + "color": { + "r": 247, + "b": 70, + 
"g": 150, + "id": "color-2" + }, + "font": { + "name": "fira sans", + "id": "font-300", + "size": 22 + } + } + ], + "attributes": { + "bounding_box": [ + { + "min_x": 62.1, + "min_y": 431.6, + "max_x": 296.5, + "max_y": 447.1, + "page": 1 + } + ] + } + }, + { + "role": "body", + "text": "OneOff-Tech UG", + "marks": [ + { + "category": "textStyle", + "color": { + "r": 0, + "b": 0, + "g": 0, + "id": "color-0" + }, + "font": { + "name": "fira sans-bold", + "id": "font-301", + "size": 11 + } + }, + { + "category": "bold" + } + ], + "attributes": { + "bounding_box": [ + { + "min_x": 62.1, + "min_y": 209.0, + "max_x": 253.5, + "max_y": 217.5, + "page": 1 + } + ] + } + } + ] + }, + { + "category": "page", + "attributes": { + "page": 2 + }, + "content": [ + { + "role": "heading", + "text": "1 First chapter", + "marks": [ + { + "category": "textStyle", + "color": { + "r": 53, + "b": 145, + "g": 96, + "id": "color-4" + }, + "font": { + "name": "fira sans", + "id": "font-300", + "size": 20 + } + } + ], + "attributes": { + "bounding_box": [ + { + "min_x": 56.7, + "min_y": 702.8, + "max_x": 193.9, + "max_y": 717.8, + "page": 2 + } + ] + } + }, + { + "role": "body", + "text": "This is an example text.", + "marks": [ + { + "category": "textStyle", + "color": { + "r": 0, + "b": 0, + "g": 0, + "id": "color-0" + }, + "font": { + "name": "fira sans", + "id": "font-300", + "size": 11 + } + } + ], + "attributes": { + "bounding_box": [ + { + "min_x": 56.7, + "min_y": 665.0, + "max_x": 504.2, + "max_y": 687.3, + "page": 2 + } + ] + } + } + ] + } + ] +} \ No newline at end of file diff --git a/test/data/extract-text-2.json b/test/data/extract-text-2.json new file mode 100644 index 0000000..a3fcd23 --- /dev/null +++ b/test/data/extract-text-2.json @@ -0,0 +1,37 @@ +{ + "category": "doc", + "content": [ + { + "category": "page", + "attributes": { + "page": 1 + }, + "content": [ + { + "role": "body", + "text": "Type of document / Offer / Contract / Report This is the title of the document, 
it can use multiple lines and grow a bit Subtitle of the document OneOff-Tech", + "marks": [], + "attributes": { + "bounding_box": [] + } + } + ] + }, + { + "category": "page", + "attributes": { + "page": 2 + }, + "content": [ + { + "role": "body", + "text": "Section Heading 1 First chapter This is an example text.", + "marks": [], + "attributes": { + "bounding_box": [] + } + } + ] + } + ] +} \ No newline at end of file diff --git a/test/data/extract-text-empty.json b/test/data/extract-text-empty.json new file mode 100644 index 0000000..cd6a8c7 --- /dev/null +++ b/test/data/extract-text-empty.json @@ -0,0 +1,12 @@ +{ + "category": "doc", + "content": [ + { + "category": "page", + "attributes": { + "page": 1 + }, + "content": [] + } + ] +} \ No newline at end of file diff --git a/test/test_validation.py b/test/test_validation.py new file mode 100644 index 0000000..339764f --- /dev/null +++ b/test/test_validation.py @@ -0,0 +1,68 @@ +import json + +import pytest + +from parse_document_model import Document, Page +from parse_document_model.attributes import PageAttributes, TextAttributes +from parse_document_model.marks import Mark, TextStyleMark, UrlMark + + +def test_read_from_json(): + filepaths = ["test/data/extract-text-1.json", + "test/data/extract-text-2.json", + "test/data/extract-text-empty.json"] + for filepath in filepaths: + doc_json = json.load(open(filepath, "r")) + doc = Document(**doc_json) + + # Check the Document + assert doc.category == "doc" + assert isinstance(doc.content, list) + + # Check the Page + for page in doc.content: + assert isinstance(page, Page) + assert page.category == "page" + assert isinstance(page.attributes, PageAttributes) + assert isinstance(page.content, list) + + # Check Text + for text in page.content: + assert text.category in ["page-header", "title", "heading", "body", "footer"] + assert isinstance(text.content, str) + assert isinstance(text.attributes, TextAttributes) + assert isinstance(text.marks, list) + + # Check 
Marks + for mark in text.marks: + assert isinstance(mark, Mark) + + +def test_style_marks(): + text_style_mark_json = [{"category": "textStyle", "font": {"id": "1", "name": "test-font", "size": 1}}, + {"category": "textStyle", "color": {"id": "1", "r": 0, "g": 0, "b": 0}}, + {"category": "textStyle", "font": {"id": "1", "name": "test-font", "size": 1}, + "color": {"id": "1", "r": 0, "g": 0, "b": 0}}, + {"category": "textStyle"}, + {"category": "textStyle", "url": "test-url"}] + for mark_json in text_style_mark_json: + if "font" in mark_json or "color" in mark_json: + mark = TextStyleMark(**mark_json) + assert isinstance(mark, TextStyleMark) + else: + with pytest.raises(ValueError): + TextStyleMark(**mark_json) + + +def test_url_marks(): + url_mark_json = [{"category": "link", "url": "test-url"}, + {"category": "link"}, + {"category": "link", "font": {"id": "1", "name": "test-font", "size": 1}}, + {"category": "link", "color": {"id": "1", "r": 0, "g": 0, "b": 0}}] + for mark_json in url_mark_json: + if "url" in mark_json: + mark = UrlMark(**mark_json) + assert isinstance(mark, UrlMark) + else: + with pytest.raises(ValueError): + UrlMark(**mark_json) From a366f4e824ac0b1c187ca69febc4efbb85d8f4a2 Mon Sep 17 00:00:00 2001 From: Alessio Vertemati Date: Tue, 17 Sep 2024 12:35:04 +0200 Subject: [PATCH 12/14] Fix document and page not inherit from StructuredNode --- parse_document_model/document.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parse_document_model/document.py b/parse_document_model/document.py index ef60e77..6f1e9db 100644 --- a/parse_document_model/document.py +++ b/parse_document_model/document.py @@ -75,14 +75,14 @@ def handle_deprecations(self): return self -class Page(Node): +class Page(StructuredNode): """The node that represents a document's page.""" category: str = "page" attributes: Optional[PageAttributes] = None content: list[Text] -class Document(Node): +class Document(StructuredNode): """The root node of a document.""" 
category: str = "doc" attributes: Optional[DocumentAttributes] = None From 674408aef45d16df72e477d6bcf8c2d80bd5b47e Mon Sep 17 00:00:00 2001 From: Alessio Vertemati Date: Tue, 17 Sep 2024 13:58:39 +0200 Subject: [PATCH 13/14] Update redme structure and add necessary GitHub files --- .github/CONTRIBUTING.md | 54 +++++++++++ .github/SECURITY.md | 3 + LICENSE | 2 +- README.md | 193 ++++++++++++++++++++++++++++------------ setup.py | 8 +- 5 files changed, 200 insertions(+), 60 deletions(-) create mode 100644 .github/CONTRIBUTING.md create mode 100644 .github/SECURITY.md diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md new file mode 100644 index 0000000..b8f4112 --- /dev/null +++ b/.github/CONTRIBUTING.md @@ -0,0 +1,54 @@ +# Contributing + +Contributions are **welcome** and will be fully **credited**. + +Please read and understand the contribution guide before creating an issue or pull request. + +## Etiquette + +This project is open source, and as such, the maintainers give their free time to build and maintain the source code held within. They make the code freely available in the hope that it will be of use to other developers. It would be extremely unfair for them to suffer abuse or anger for their hard work. + +Please be considerate towards maintainers when raising issues or presenting pull requests. Let's show the +world that developers are civilized and selfless people. + +It's the duty of the maintainer to ensure that all submissions to the project are of sufficient +quality to benefit the project. Many developers have different skillsets, strengths, and weaknesses. Respect the maintainer's decision, and do not be upset or abusive if your submission is not used. + +## Viability + +When requesting or submitting new features, first consider whether it might be useful to others. Open +source projects are used by many developers, who may have entirely different needs to your own. 
Think about +whether or not your feature is likely to be used by other users of the project. + +## Procedure + +> [!NOTE] +> Issue tracking is not currently enabled for this repository. We are organising it. + +Before filing an issue: + +- Attempt to replicate the problem, to ensure that it wasn't a coincidental incident. +- Check to make sure your feature suggestion isn't already present within the project. +- Check the pull requests tab to ensure that the bug doesn't have a fix in progress. +- Check the pull requests tab to ensure that the feature isn't already in progress. + +Before submitting a pull request: + +- Check the codebase to ensure that your feature doesn't already exist. +- Check the pull requests to ensure that another person hasn't already submitted the feature or fix. + +## Requirements + +If the project maintainer has any additional requirements, you will find them listed here. + +- **Add tests!** - Your patch won't be accepted if it doesn't have tests. + +- **Document any change in behaviour** - Make sure the `README.md` and any other relevant documentation are kept up-to-date. + +- **Consider our release cycle** - We try to follow [SemVer v2.0.0](https://semver.org/). Randomly breaking public APIs is not an option. + +- **One pull request per feature** - If you want to do more than one thing, send multiple pull requests. + +- **Send coherent history** - Make sure each individual commit in your pull request is meaningful. If you had to make multiple intermediate commits while developing, please [squash them](https://www.git-scm.com/book/en/v2/Git-Tools-Rewriting-History#Changing-Multiple-Commit-Messages) before submitting. + +**Happy coding**! 
\ No newline at end of file diff --git a/.github/SECURITY.md b/.github/SECURITY.md new file mode 100644 index 0000000..8e032a3 --- /dev/null +++ b/.github/SECURITY.md @@ -0,0 +1,3 @@ +# Security Policy + +If you discover any security related issues, please email security@oneofftech.xyz instead of using the discussions or the issue tracker. \ No newline at end of file diff --git a/LICENSE b/LICENSE index 5d0b89e..d7f078f 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2024 OneOffTech +Copyright (c) OneOffTech Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index e8c9a8c..d02a276 100644 --- a/README.md +++ b/README.md @@ -1,95 +1,139 @@ -![pypi](https://img.shields.io/pypi/v/document-model-python.svg) +![pypi](https://img.shields.io/pypi/v/parse-document-model-python.svg) [![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://docs.pydantic.dev/latest/contributing/#badges) [![License](https://img.shields.io/badge/license-MIT-green.svg)](LICENSE) -# :card_file_box: Document Model Python +# Parse Document Model (Python) -**Document Model Python** is a library for representing text documents using a hierarchical model. -This library allows you to define documents as a collection of nodes where each node can represent a document, page, -text, heading, body, and more. +**Parse Document Model** (Python) provides Pydantic models for representing text documents using a hierarchical model. +This library allows you to define documents as a hierarchy of (specialised) nodes where each node can represent a document, page, text, heading, body, and more. 
-## 🌟 Key Features +These models aim to preserve the underlying structure of text documents for further processing, such as creating a table of contents or transforming between formats, e.g. converting a parsed PDF to Markdown. -- **Hierarchical Structure**: The document is modeled as a hierarchy of nodes. Each node could represent a part of the -document such as a document itself, pages, text. -- **Rich Text Support**: Nodes can represent not only the content but also the marks (e.g., bold, italic) applied to -the text. -- **Attributes**: Every node can have attributes that provide additional information such as page number, +- **Hierarchical structure**: The document is modelled as a hierarchy of nodes. Each node can represent a part of the +document itself, pages, text. +- **Rich text support**: Nodes can represent not only the content but also the formatting (e.g. bold, italic) applied to the text. +- **Attributes**: Each node can have attributes that provide additional information such as page number, bounding box, etc. -- **Easy-to-use**: Built with [`Pydantic`](https://docs.pydantic.dev/latest/), ensures type validation and effortless -creation of complex document structures. +- **Built-in validation and types**: Built with [`Pydantic`](https://docs.pydantic.dev/latest/), ensuring type safety, validation and effortless creation of complex document structures. -## πŸ“š Structure Overview -### 1. **Node** (Base Class) +**Requirements** + +- Python 3.12 or above (Python 3.9, 3.10 and 3.11 are supported on best-effort). + + +**Next steps** + +- [Explore the document model](#document-model-overview) +- [Install the library and use the models](#getting-started) + + +## Document Model Overview + +We want to represent the document structure using a hierarchy so that the inherited structure is preserved when chapters, sections and headings are used. Consider a generic document with two pages, one heading per page and one paragraph of text. 
The resulting representation might be the following. + +``` +Document + ├─Page + │ ├─Text (category: heading) + │ └─Text (category: body) + └─Page + ├─Text (category: heading) + └─Text (category: body) +``` + +At a glance you can see the structure: the document is composed of two pages and there are two headings. To do so we defined a hierarchy around the concept of a Node, like a node in a graph. + +### Node types + +```mermaid +classDiagram + class Node + Node <|-- StructuredNode + Node <|-- Text + StructuredNode <|-- Document + StructuredNode <|-- Page +``` + + +#### 1. **Node** (Base Class) + +This is the abstract class from which all other nodes inherit. -The base element of the document is a `Node`. This is the abstract class from which all other nodes inherit. Each node has: + - `category`: The type of the node (e.g., `doc`, `page`, `heading`). -- `attributes`: Optional field to attach extra data to a node. +- `attributes`: Optional field to attach extra data to a node. See [Attributes](#attributes). + +#### 2. **StructuredNode** -### 2. **StructuredNode** +This extends the [`Node`](#1-node-base-class). It is used to represent the hierarchy as a node whose content is a list of other nodes, such as [`Document`](#3-document) and [`Page`](#4-page). -This extends the `Node` class and can contain other nodes as content. It is used for non-leaf nodes like -`Document` and `Page`. +- `content`: List of `Node`. -### 3. **Text** -This is a leaf node and contains the actual text content: +#### 3. **Document** -- `content`: The main text content. -- `marks`: List of text marks like bold, italic, text style, etc. -- `text`: Deprecated field, use `content` instead. -- `role`: Deprecated field, use `category` instead. +This is the root node of a document. -### 4. **Page** +- `category`: Always set to `"doc"`. +- `attributes`: Document-wide attributes can be set here. +- `content`: List of [`Page`](#4-page) nodes that form the document. + +#### 4.
**Page** Represents a page in the document: - `category`: Always set to `"page"`. - `attributes`: Can contain metadata like page number. -- `content`: List of `Text` nodes on the page. +- `content`: List of [`Text`](#5-text) nodes on the page. -### 5. **Document** +#### 5. **Text** -This is the root node of the document: +This node represents a paragraph, a heading or any text within the document. + +- `category`: The type of the text, e.g. `"heading"` or `"body"`. +- `content`: A string representing the textual content. +- `marks`: List of [marks](#marks) applied to the text, such as bold, italic, etc. +- `attributes`: Can contain metadata like the bounding box representing where this portion of text is located in the page. -- `category`: Always set to `"doc"`. -- `attributes`: Document-wide attributes can be set here. -- `content`: List of `Page` nodes that form the document. -## 🖋️ Marks -Marks are used to style or add functionality to the text inside a `Text` node. -For example, bold text, italic text, links, and custom styles like font or color. +### Marks -### **Mark Types** +Marks are used to add style or functionality to the text within a [`Text`](#5-text) node. +For example, bold text, italic text, links and custom styles such as font or colour. -- **Bold**: Represents bold text. -- **Italic**: Represents italic text. -- **TextStyle**: Allows customization of font and color. -- **Link**: Represents a hyperlink. +**Mark Types** + +- `Bold`: Represents bold text. +- `Italic`: Represents italic text. +- `TextStyle`: Allows customization of font and color. +- `Link`: Represents a hyperlink. Marks are validated and enforced with the help of `Pydantic` model validators. -## 🧩 Attributes +### Attributes -Attributes are optional fields that can store extra information for each node. Some predefined attributes include: +Attributes are optional fields that can store additional information for each node.
Some predefined attributes are: +- `DocumentAttributes`: General attributes for the document (currently reserved for the future). +- `PageAttributes`: Specific page-related attributes, such as the page number. +- `TextAttributes`: Text-related attributes, such as bounding boxes. - `BoundingBox`: A box that specifies the position of a text in the page. -- `DocumentAttributes`: General attributes for the document. -- `PageAttributes`: Specific attributes like page number for the page. -- `TextAttributes`: Attributes such as bounding boxes for the text. -## 🏗️ Installation -The library `document-model-python` is distributed with PyPI, and you can easily install it with `pip`: +## Getting started + +### Installation + +Parse Document Model is distributed with PyPI. You can install it with `pip`. ```bash -pip install document-model-python +pip install parse-document-model-python ``` -## 🚀 Quick Example +### Quick Example Here’s how you can represent a simple document with one page and some text: @@ -104,7 +148,7 @@ doc = Document( content=[ Text( category="heading", - content="Welcome to document-model-python", + content="Welcome to parse-document-model-python", marks=["bold"] ), Text( @@ -117,10 +161,49 @@ doc = Document( ) ``` -## 💡 Contributing +## Testing + +Parse Document Model is tested using [pytest](https://docs.pytest.org/en/stable/). Tests run for each commit and pull request. + +Install the dependencies. + +```bash +pip install -r requirements.txt -r requirements-dev.txt +``` + +Execute the test suite. + +```bash +pytest +``` + + +## Contributing + +Thank you for considering contributing to the Parse Document Model! The contribution guide can be found in the [CONTRIBUTING.md](./.github/CONTRIBUTING.md) file. + +> [!NOTE] +> Consider opening a [discussion](https://github.com/OneOffTech/parse-document-model-python/discussions) before submitting a pull request with changes to the model structures.
+ +## Security Vulnerabilities + +Please review [our security policy](./.github/SECURITY.md) on how to report security vulnerabilities. + +## Credits + +- [OneOffTech](https://github.com/OneOffTech) +- [All Contributors](../../contributors) + +## Supporters + +The project is provided and supported by [OneOff-Tech (UG)](https://oneofftech.de). + +

+ +## Acknowledgements -Feel free to submit issues or contribute to the development of this library. We appreciate your feedback! +The format and structure take inspiration from [ProseMirror](https://prosemirror.net/docs/ref/#model.Document_Schema). -## 📜 License +## License -This project is licensed under the MIT License. +The MIT License (MIT). Please see [License File](LICENSE) for more information. diff --git a/setup.py b/setup.py index 68b34b3..9d0edb0 100644 --- a/setup.py +++ b/setup.py @@ -11,15 +11,15 @@ setup( name='document-model-python', version='0.1.0', - description='Define the pydantic models for a text document.', + description='Pydantic models for representing a text document as a hierarchical structure.', long_description=long_description, long_description_content_type='text/markdown', author='OneOffTech', - author_email='info@oneofftech.de', + author_email='info@oneofftech.xyz', license='MIT', - url='https://github.com/OneOffTech/document-model-python', + url='https://github.com/OneOffTech/parse-document-model-python', project_urls={ - 'Source': 'https://github.com/OneOffTech/document-model-python', + 'Source': 'https://github.com/OneOffTech/parse-document-model-python', }, classifiers=[ 'Intended Audience :: Developers', From 589d15696d1428777ab71c8f44e67f9b15769285 Mon Sep 17 00:00:00 2001 From: Alessio Vertemati Date: Tue, 17 Sep 2024 13:58:53 +0200 Subject: [PATCH 14/14] Add basic dev container configuration --- .devcontainer/devcontainer.json | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 .devcontainer/devcontainer.json diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 0000000..fcd29af --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,16 @@ +// For format details, see https://aka.ms/devcontainer.json.
For config options, see the +// README at: https://github.com/devcontainers/templates/tree/main/src/python +{ + "name": "Python 3", + // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile + "image": "mcr.microsoft.com/devcontainers/python:1-3.12-bullseye", + + // Features to add to the dev container. More info: https://containers.dev/features. + // "features": {}, + + // Use 'postCreateCommand' to run commands after the container is created. + "postCreateCommand": "pip3 install --user -r requirements.txt -r requirements-dev.txt" + + // Configure tool-specific properties. + // "customizations": {} +}