Skip to content

feat: add annotations in MD & HTML serialization #295

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
May 27, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions docling_core/transforms/serializer/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,11 @@
KeyValueItem,
NodeItem,
OrderedList,
PictureClassificationData,
PictureDataType,
PictureDescriptionData,
PictureItem,
PictureMoleculeData,
TableItem,
TextItem,
UnorderedList,
Expand Down Expand Up @@ -118,6 +122,23 @@ def _iterate_items(
yield item


def _get_picture_annotation_text(annotation: PictureDataType) -> Optional[str]:
    """Return the text to serialize for a picture annotation, if any.

    The value depends on the concrete annotation type:

    - ``PictureClassificationData``: the ``class_name`` of the first entry in
      ``predicted_classes``, with underscores replaced by spaces.
    - ``PictureDescriptionData``: the annotation's ``text``.
    - ``PictureMoleculeData``: the annotation's ``smi`` value.

    :param annotation: The picture annotation to extract text from.
    :returns: The extracted text, or ``None`` when the annotation is of another
        type, has no predicted classes, or its first class name is ``None``.
    """
    if isinstance(annotation, PictureClassificationData):
        # Only the first predicted class is considered; an empty list yields
        # no text at all.
        if not annotation.predicted_classes:
            return None
        first_class_name = annotation.predicted_classes[0].class_name
        if first_class_name is None:
            return None
        # Class names use underscores as word separators; emit plain words.
        return first_class_name.replace("_", " ")

    if isinstance(annotation, PictureDescriptionData):
        return annotation.text

    if isinstance(annotation, PictureMoleculeData):
        return annotation.smi

    # Any other annotation type has no textual rendering here.
    return None


def create_ser_result(
*,
text: str = "",
Expand Down
53 changes: 42 additions & 11 deletions docling_core/transforms/serializer/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
from docling_core.transforms.serializer.common import (
CommonParams,
DocSerializer,
_get_picture_annotation_text,
create_ser_result,
)
from docling_core.transforms.serializer.html_styles import (
Expand Down Expand Up @@ -110,6 +111,8 @@ class HTMLParams(CommonParams):
# Enable charts to be printed into HTML as tables
enable_chart_tables: bool = True

include_annotations: bool = True


class HTMLTextSerializer(BaseModel, BaseTextSerializer):
"""HTML-specific text item serializer."""
Expand Down Expand Up @@ -943,18 +946,46 @@ def serialize_captions(
params = self.params.merge_with_patch(patch=kwargs)
results: list[SerializationResult] = []
text_res = ""
excluded_refs = self.get_excluded_refs(**kwargs)

if DocItemLabel.CAPTION in params.labels:
results = [
create_ser_result(text=it.text, span_source=it)
for cap in item.captions
if isinstance(it := cap.resolve(self.doc), TextItem)
and it.self_ref not in self.get_excluded_refs(**kwargs)
]
text_res = params.caption_delim.join([r.text for r in results])
if text_res:
text_dir = get_text_direction(text_res)
dir_str = f' dir="{text_dir}"' if text_dir == "rtl" else ""
text_res = f"<{tag}{dir_str}>{html.escape(text_res)}</{tag}>"
for cap in item.captions:
if (
isinstance(it := cap.resolve(self.doc), TextItem)
and it.self_ref not in excluded_refs
):
text_cap = it.text
text_dir = get_text_direction(text_cap)
dir_str = f' dir="{text_dir}"' if text_dir == "rtl" else ""
cap_ser_res = create_ser_result(
text=(
f'<div class="caption"{dir_str}>'
f"{html.escape(text_cap)}"
f"</div>"
),
span_source=it,
)
results.append(cap_ser_res)

if params.include_annotations and item.self_ref not in excluded_refs:
if isinstance(item, PictureItem):
for ann in item.annotations:
if ann_text := _get_picture_annotation_text(annotation=ann):
text_dir = get_text_direction(ann_text)
dir_str = f' dir="{text_dir}"' if text_dir == "rtl" else ""
ann_ser_res = create_ser_result(
text=(
f'<div data-annotation-kind="{ann.kind}"{dir_str}>'
f"{html.escape(ann_text)}"
f"</div>"
),
span_source=item,
)
results.append(ann_ser_res)

text_res = params.caption_delim.join([r.text for r in results])
if text_res:
text_res = f"<{tag}>{text_res}</{tag}>"
return create_ser_result(text=text_res, span_source=results)

def _generate_head(self) -> str:
Expand Down
21 changes: 21 additions & 0 deletions docling_core/transforms/serializer/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from docling_core.transforms.serializer.common import (
CommonParams,
DocSerializer,
_get_picture_annotation_text,
_PageBreakSerResult,
create_ser_result,
)
Expand Down Expand Up @@ -69,6 +70,8 @@ class MarkdownParams(CommonParams):
page_break_placeholder: Optional[str] = None # e.g. "<!-- page break -->"
escape_underscores: bool = True
escape_html: bool = True
include_annotations: bool = True
mark_annotations: bool = False


class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
Expand Down Expand Up @@ -210,6 +213,24 @@ def serialize(
res_parts.append(cap_res)

if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
if params.include_annotations:

for ann in item.annotations:
if ann_text := _get_picture_annotation_text(annotation=ann):
ann_ser_res = create_ser_result(
text=(
(
f'<!--<annotation kind="{ann.kind}">-->'
f"{ann_text}"
f"<!--<annotation/>-->"
)
if params.mark_annotations
else ann_text
),
span_source=item,
)
res_parts.append(ann_ser_res)

img_res = self._serialize_image_part(
item=item,
doc=doc,
Expand Down
20 changes: 18 additions & 2 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -2925,6 +2925,7 @@ def save_as_markdown(
page_no: Optional[int] = None,
included_content_layers: Optional[set[ContentLayer]] = None,
page_break_placeholder: Optional[str] = None,
include_annotations: bool = True,
):
"""Save to markdown."""
if isinstance(filename, str):
Expand Down Expand Up @@ -2952,6 +2953,7 @@ def save_as_markdown(
page_no=page_no,
included_content_layers=included_content_layers,
page_break_placeholder=page_break_placeholder,
include_annotations=include_annotations,
)

with open(filename, "w", encoding="utf-8") as fw:
Expand All @@ -2973,6 +2975,8 @@ def export_to_markdown( # noqa: C901
page_no: Optional[int] = None,
included_content_layers: Optional[set[ContentLayer]] = None,
page_break_placeholder: Optional[str] = None, # e.g. "<!-- page break -->",
include_annotations: bool = True,
mark_annotations: bool = False,
) -> str:
r"""Serialize to Markdown.
Expand All @@ -2992,9 +2996,9 @@ def export_to_markdown( # noqa: C901
:type labels: Optional[set[DocItemLabel]] = None
:param strict_text: Deprecated.
:type strict_text: bool = False
:param escaping_underscores: bool: Whether to escape underscores in the
:param escape_underscores: bool: Whether to escape underscores in the
text content of the document. (Default value = True).
:type escaping_underscores: bool = True
:type escape_underscores: bool = True
:param image_placeholder: The placeholder to include to position
images in the markdown. (Default value = "\<!-- image --\>").
:type image_placeholder: str = "<!-- image -->"
Expand All @@ -3010,6 +3014,12 @@ def export_to_markdown( # noqa: C901
:param page_break_placeholder: The placeholder to include for marking page
breaks. None means no page break placeholder will be used.
:type page_break_placeholder: Optional[str] = None
:param include_annotations: bool: Whether to include annotations in the export.
(Default value = True).
:type include_annotations: bool = True
:param mark_annotations: bool: Whether to mark annotations in the export; only
relevant if include_annotations is True. (Default value = False).
:type mark_annotations: bool = False
:returns: The exported Markdown representation.
:rtype: str
"""
Expand Down Expand Up @@ -3039,6 +3049,8 @@ def export_to_markdown( # noqa: C901
indent=indent,
wrap_width=text_width if text_width > 0 else None,
page_break_placeholder=page_break_placeholder,
include_annotations=include_annotations,
mark_annotations=mark_annotations,
),
)
ser_res = serializer.serialize()
Expand Down Expand Up @@ -3088,6 +3100,7 @@ def save_as_html(
html_head: str = "null", # should be deprecated
included_content_layers: Optional[set[ContentLayer]] = None,
split_page_view: bool = False,
include_annotations: bool = True,
):
"""Save to HTML."""
if isinstance(filename, str):
Expand All @@ -3113,6 +3126,7 @@ def save_as_html(
html_head=html_head,
included_content_layers=included_content_layers,
split_page_view=split_page_view,
include_annotations=include_annotations,
)

with open(filename, "w", encoding="utf-8") as fw:
Expand Down Expand Up @@ -3165,6 +3179,7 @@ def export_to_html( # noqa: C901
html_head: str = "null", # should be deprecated ...
included_content_layers: Optional[set[ContentLayer]] = None,
split_page_view: bool = False,
include_annotations: bool = True,
) -> str:
r"""Serialize to HTML."""
from docling_core.transforms.serializer.html import (
Expand Down Expand Up @@ -3196,6 +3211,7 @@ def export_to_html( # noqa: C901
html_head=html_head,
html_lang=html_lang,
output_style=output_style,
include_annotations=include_annotations,
)

if html_head == "null":
Expand Down
Loading