Skip to content

Commit f067c51

Browse files
vagenas and dolfim-ibm authored
feat: add annotations in MD & HTML serialization (#295)
* feat: include annotations in MD & HTML serialization
  Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>
* (HTML) move annotations into figcaptions
  Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>
* add explicit beginning/end markers, fix case of excluded refs
  Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>
* improve annotation marking, extend tests
  Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>
* wrap captions (#305)
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* revert temp test changes
  Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>
---------
Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
1 parent 4a174b5 commit f067c51

26 files changed

+8861
-3038
lines changed

docling_core/transforms/serializer/common.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,11 @@
3939
KeyValueItem,
4040
NodeItem,
4141
OrderedList,
42+
PictureClassificationData,
43+
PictureDataType,
44+
PictureDescriptionData,
4245
PictureItem,
46+
PictureMoleculeData,
4347
TableItem,
4448
TextItem,
4549
UnorderedList,
@@ -118,6 +122,23 @@ def _iterate_items(
118122
yield item
119123

120124

125+
def _get_picture_annotation_text(annotation: PictureDataType) -> Optional[str]:
    """Return the plain-text form of a picture annotation, if it has one.

    :param annotation: The picture annotation to render as text.
    :returns: For classification data, the class name of the first predicted
        class with underscores replaced by spaces; for description data, its
        ``text``; for molecule data, its ``smi`` value. ``None`` when the
        annotation is of any other type or a classification carries no
        usable class name.
    """
    if isinstance(annotation, PictureClassificationData):
        # Only the first entry of predicted_classes is used.
        if not annotation.predicted_classes:
            return None
        top_class_name = annotation.predicted_classes[0].class_name
        if top_class_name is None:
            return None
        return top_class_name.replace("_", " ")
    if isinstance(annotation, PictureDescriptionData):
        return annotation.text
    if isinstance(annotation, PictureMoleculeData):
        return annotation.smi
    return None
140+
141+
121142
def create_ser_result(
122143
*,
123144
text: str = "",

docling_core/transforms/serializer/html.py

Lines changed: 42 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
from docling_core.transforms.serializer.common import (
3636
CommonParams,
3737
DocSerializer,
38+
_get_picture_annotation_text,
3839
create_ser_result,
3940
)
4041
from docling_core.transforms.serializer.html_styles import (
@@ -110,6 +111,8 @@ class HTMLParams(CommonParams):
110111
# Enable charts to be printed into HTML as tables
111112
enable_chart_tables: bool = True
112113

114+
include_annotations: bool = True
115+
113116

114117
class HTMLTextSerializer(BaseModel, BaseTextSerializer):
115118
"""HTML-specific text item serializer."""
@@ -943,18 +946,46 @@ def serialize_captions(
943946
params = self.params.merge_with_patch(patch=kwargs)
944947
results: list[SerializationResult] = []
945948
text_res = ""
949+
excluded_refs = self.get_excluded_refs(**kwargs)
950+
946951
if DocItemLabel.CAPTION in params.labels:
947-
results = [
948-
create_ser_result(text=it.text, span_source=it)
949-
for cap in item.captions
950-
if isinstance(it := cap.resolve(self.doc), TextItem)
951-
and it.self_ref not in self.get_excluded_refs(**kwargs)
952-
]
953-
text_res = params.caption_delim.join([r.text for r in results])
954-
if text_res:
955-
text_dir = get_text_direction(text_res)
956-
dir_str = f' dir="{text_dir}"' if text_dir == "rtl" else ""
957-
text_res = f"<{tag}{dir_str}>{html.escape(text_res)}</{tag}>"
952+
for cap in item.captions:
953+
if (
954+
isinstance(it := cap.resolve(self.doc), TextItem)
955+
and it.self_ref not in excluded_refs
956+
):
957+
text_cap = it.text
958+
text_dir = get_text_direction(text_cap)
959+
dir_str = f' dir="{text_dir}"' if text_dir == "rtl" else ""
960+
cap_ser_res = create_ser_result(
961+
text=(
962+
f'<div class="caption"{dir_str}>'
963+
f"{html.escape(text_cap)}"
964+
f"</div>"
965+
),
966+
span_source=it,
967+
)
968+
results.append(cap_ser_res)
969+
970+
if params.include_annotations and item.self_ref not in excluded_refs:
971+
if isinstance(item, PictureItem):
972+
for ann in item.annotations:
973+
if ann_text := _get_picture_annotation_text(annotation=ann):
974+
text_dir = get_text_direction(ann_text)
975+
dir_str = f' dir="{text_dir}"' if text_dir == "rtl" else ""
976+
ann_ser_res = create_ser_result(
977+
text=(
978+
f'<div data-annotation-kind="{ann.kind}"{dir_str}>'
979+
f"{html.escape(ann_text)}"
980+
f"</div>"
981+
),
982+
span_source=item,
983+
)
984+
results.append(ann_ser_res)
985+
986+
text_res = params.caption_delim.join([r.text for r in results])
987+
if text_res:
988+
text_res = f"<{tag}>{text_res}</{tag}>"
958989
return create_ser_result(text=text_res, span_source=results)
959990

960991
def _generate_head(self) -> str:

docling_core/transforms/serializer/markdown.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
from docling_core.transforms.serializer.common import (
3030
CommonParams,
3131
DocSerializer,
32+
_get_picture_annotation_text,
3233
_PageBreakSerResult,
3334
create_ser_result,
3435
)
@@ -69,6 +70,8 @@ class MarkdownParams(CommonParams):
6970
page_break_placeholder: Optional[str] = None # e.g. "<!-- page break -->"
7071
escape_underscores: bool = True
7172
escape_html: bool = True
73+
include_annotations: bool = True
74+
mark_annotations: bool = False
7275

7376

7477
class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
@@ -210,6 +213,24 @@ def serialize(
210213
res_parts.append(cap_res)
211214

212215
if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
216+
if params.include_annotations:
217+
218+
for ann in item.annotations:
219+
if ann_text := _get_picture_annotation_text(annotation=ann):
220+
ann_ser_res = create_ser_result(
221+
text=(
222+
(
223+
f'<!--<annotation kind="{ann.kind}">-->'
224+
f"{ann_text}"
225+
f"<!--<annotation/>-->"
226+
)
227+
if params.mark_annotations
228+
else ann_text
229+
),
230+
span_source=item,
231+
)
232+
res_parts.append(ann_ser_res)
233+
213234
img_res = self._serialize_image_part(
214235
item=item,
215236
doc=doc,

docling_core/types/doc/document.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2925,6 +2925,7 @@ def save_as_markdown(
29252925
page_no: Optional[int] = None,
29262926
included_content_layers: Optional[set[ContentLayer]] = None,
29272927
page_break_placeholder: Optional[str] = None,
2928+
include_annotations: bool = True,
29282929
):
29292930
"""Save to markdown."""
29302931
if isinstance(filename, str):
@@ -2952,6 +2953,7 @@ def save_as_markdown(
29522953
page_no=page_no,
29532954
included_content_layers=included_content_layers,
29542955
page_break_placeholder=page_break_placeholder,
2956+
include_annotations=include_annotations,
29552957
)
29562958

29572959
with open(filename, "w", encoding="utf-8") as fw:
@@ -2973,6 +2975,8 @@ def export_to_markdown( # noqa: C901
29732975
page_no: Optional[int] = None,
29742976
included_content_layers: Optional[set[ContentLayer]] = None,
29752977
page_break_placeholder: Optional[str] = None, # e.g. "<!-- page break -->",
2978+
include_annotations: bool = True,
2979+
mark_annotations: bool = False,
29762980
) -> str:
29772981
r"""Serialize to Markdown.
29782982
@@ -2992,9 +2996,9 @@ def export_to_markdown( # noqa: C901
29922996
:type labels: Optional[set[DocItemLabel]] = None
29932997
:param strict_text: Deprecated.
29942998
:type strict_text: bool = False
2995-
:param escaping_underscores: bool: Whether to escape underscores in the
2999+
:param escape_underscores: bool: Whether to escape underscores in the
29963000
text content of the document. (Default value = True).
2997-
:type escaping_underscores: bool = True
3001+
:type escape_underscores: bool = True
29983002
:param image_placeholder: The placeholder to include to position
29993003
images in the markdown. (Default value = "\<!-- image --\>").
30003004
:type image_placeholder: str = "<!-- image -->"
@@ -3010,6 +3014,12 @@ def export_to_markdown( # noqa: C901
30103014
:param page_break_placeholder: The placeholder to include for marking page
30113015
breaks. None means no page break placeholder will be used.
30123016
:type page_break_placeholder: Optional[str] = None
3017+
:param include_annotations: bool: Whether to include annotations in the export.
3018+
(Default value = True).
3019+
:type include_annotations: bool = True
3020+
:param mark_annotations: bool: Whether to mark annotations in the export; only
3021+
relevant if include_annotations is True. (Default value = False).
3022+
:type mark_annotations: bool = False
30133023
:returns: The exported Markdown representation.
30143024
:rtype: str
30153025
"""
@@ -3039,6 +3049,8 @@ def export_to_markdown( # noqa: C901
30393049
indent=indent,
30403050
wrap_width=text_width if text_width > 0 else None,
30413051
page_break_placeholder=page_break_placeholder,
3052+
include_annotations=include_annotations,
3053+
mark_annotations=mark_annotations,
30423054
),
30433055
)
30443056
ser_res = serializer.serialize()
@@ -3088,6 +3100,7 @@ def save_as_html(
30883100
html_head: str = "null", # should be deprecated
30893101
included_content_layers: Optional[set[ContentLayer]] = None,
30903102
split_page_view: bool = False,
3103+
include_annotations: bool = True,
30913104
):
30923105
"""Save to HTML."""
30933106
if isinstance(filename, str):
@@ -3113,6 +3126,7 @@ def save_as_html(
31133126
html_head=html_head,
31143127
included_content_layers=included_content_layers,
31153128
split_page_view=split_page_view,
3129+
include_annotations=include_annotations,
31163130
)
31173131

31183132
with open(filename, "w", encoding="utf-8") as fw:
@@ -3165,6 +3179,7 @@ def export_to_html( # noqa: C901
31653179
html_head: str = "null", # should be deprecated ...
31663180
included_content_layers: Optional[set[ContentLayer]] = None,
31673181
split_page_view: bool = False,
3182+
include_annotations: bool = True,
31683183
) -> str:
31693184
r"""Serialize to HTML."""
31703185
from docling_core.transforms.serializer.html import (
@@ -3196,6 +3211,7 @@ def export_to_html( # noqa: C901
31963211
html_head=html_head,
31973212
html_lang=html_lang,
31983213
output_style=output_style,
3214+
include_annotations=include_annotations,
31993215
)
32003216

32013217
if html_head == "null":

0 commit comments

Comments
 (0)