
Commit 28d166d

feat: Update CVAT for multi-page annotation, utility to create sliced PDFs (#90)
* updated cli with viz and chunk parameters
* added capability to create pdf collection with sliding window
* WIP: work on sliding window CVAT
* CVAT tasks for sliding window are created correctly
* fixed the export of the overview
* fixed mypy
* upgraded the visualization code
* fixed the cvat_dataset_builder to be able to do multipage
* reformatted the code
* fixed the tests
* fixed mypy
* first working version of the grouped images
* reformatted the code
* cleaning up some of the code, more to do ...
* removed the picture-area to avoid image duplication
* reformatted the code
* fixed the insertion
* fixed the nested lists
* fixed the nested lists (2)
* added e2e tests for building datasets with CVAT
* added e2e tests for building datasets with CVAT (2)
* added e2e tests for building datasets with CVAT (3)
* everything works as expected, now needs some clean up
* Renamings for create_sliced_pdfs, small fixes

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
1 parent 4e01d0b commit 28d166d

23 files changed: +3475 -134 lines changed

docling_eval/cli/main.py

Lines changed: 70 additions & 1 deletion
@@ -1,3 +1,4 @@
+import glob
 import json
 import logging
 import os
@@ -17,6 +18,7 @@
 from docling.document_converter import FormatOption, PdfFormatOption
 from docling.models.factories import get_ocr_factory
 from docling.pipeline.vlm_pipeline import VlmPipeline
+from PyPDF2 import PdfReader, PdfWriter
 from tabulate import tabulate  # type: ignore

 from docling_eval.datamodels.types import (
@@ -768,6 +770,56 @@ def visualize(
         _log.error(f"Unsupported modality for visualization: {modality}")


+@app.command()
+def create_sliced_pdfs(
+    output_dir: Annotated[Path, typer.Option(help="Output directory")],
+    source_dir: Annotated[Path, typer.Option(help="Dataset source path with PDFs")],
+    slice_length: Annotated[int, typer.Option(help="sliding window")] = 1,
+    num_overlap: Annotated[int, typer.Option(help="overlap window")] = 0,
+):
+    """Process multi-page pdf documents into chunks of slice_length with num_overlap overlapping pages in each slice."""
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    if slice_length < 1:
+        return ValueError("slice-length must be at least 1.")
+    if num_overlap > slice_length - 1:
+        return ValueError("num-overlap must be at most one less than slice-length")
+
+    num_overlap = max(num_overlap, 0)
+
+    pdf_paths = glob.glob(f"{source_dir}/**/*.pdf", recursive=True)
+    _log.info(f"#-pdfs: {pdf_paths}")
+
+    for pdf_path in pdf_paths:
+        base_name = os.path.basename(pdf_path).replace(".pdf", "")
+
+        try:
+            with open(pdf_path, "rb") as pdf_file:
+                reader = PdfReader(pdf_file)
+                total_pages = len(reader.pages)
+
+                _log.info(f"Processing {pdf_path} ({total_pages} pages)")
+
+                for start_page in range(0, total_pages, slice_length - num_overlap):
+                    end_page = min(start_page + slice_length, total_pages)
+
+                    # Create a new PDF with the pages in the current window
+                    writer = PdfWriter()
+
+                    for page_num in range(start_page, end_page):
+                        writer.add_page(reader.pages[page_num])
+
+                    # Save the new PDF
+                    output_path = os.path.join(
+                        output_dir, f"{base_name}_ps_{start_page}_pe_{end_page}.pdf"
+                    )
+                    with open(output_path, "wb") as output_file:
+                        writer.write(output_file)
+
+        except Exception as e:
+            _log.error(f"Error processing {pdf_path}: {e}")
+
+
 @app.command()
 def create_cvat(
     output_dir: Annotated[Path, typer.Option(help="Output directory")],
@@ -798,6 +850,9 @@ def create_gt(
         int, typer.Option(help="End index (exclusive), -1 for all")
     ] = -1,
     chunk_size: Annotated[int, typer.Option(help="chunk size")] = 80,
+    do_visualization: Annotated[
+        bool, typer.Option(help="visualize the predictions")
+    ] = True,
 ):
     """Create ground truth dataset only."""
     gt_dir = output_dir / "gt_dataset"
@@ -815,7 +870,9 @@ def create_gt(
         # Retrieve and save the dataset
         if dataset_builder.must_retrieve:
             dataset_builder.retrieve_input_dataset()
-        dataset_builder.save_to_disk(chunk_size=chunk_size)
+        dataset_builder.save_to_disk(
+            chunk_size=chunk_size, do_visualization=do_visualization
+        )

         _log.info(f"Ground truth dataset created at {gt_dir}")
     except ValueError as e:
@@ -837,6 +894,7 @@ def create_eval(
     end_index: Annotated[
         int, typer.Option(help="End index (exclusive), -1 for all")
     ] = -1,
+    chunk_size: Annotated[int, typer.Option(help="chunk size")] = 80,
     # File provider required options
     file_prediction_format: Annotated[
         Optional[str],
@@ -856,6 +914,9 @@
             help="Directory for local model artifacts. Will only be passed to providers supporting this."
         ),
     ] = None,
+    do_visualization: Annotated[
+        bool, typer.Option(help="visualize the predictions")
+    ] = True,
 ):
     """Create evaluation dataset from existing ground truth."""
     gt_dir = gt_dir or output_dir / "gt_dataset"
@@ -883,6 +944,7 @@
             file_source_path=file_source_path,
             file_prediction_format=file_format,
             artifacts_path=artifacts_path,
+            do_visualization=do_visualization,
         )

         # Get the dataset name from the benchmark
@@ -896,6 +958,7 @@
             split=split,
             begin_index=begin_index,
             end_index=end_index,
+            chunk_size=chunk_size,
         )

         _log.info(f"Evaluation dataset created at {pred_dir}")
@@ -926,6 +989,9 @@ def create(
     file_source_path: Annotated[
         Optional[Path], typer.Option(help="Source path for File provider")
     ] = None,
+    do_visualization: Annotated[
+        bool, typer.Option(help="visualize the predictions")
+    ] = True,
 ):
     """Create both ground truth and evaluation datasets in one step."""
     # First create ground truth
@@ -937,6 +1003,7 @@
         begin_index=begin_index,
         end_index=end_index,
         chunk_size=chunk_size,
+        do_visualization=do_visualization,
     )

     # Then create evaluation if provider specified
@@ -948,8 +1015,10 @@
             split=split,
             begin_index=begin_index,
             end_index=end_index,
+            chunk_size=chunk_size,
             file_prediction_format=file_prediction_format,
             file_source_path=file_source_path,
+            do_visualization=do_visualization,
         )
     else:
         _log.info(
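
For orientation, here is a minimal standalone sketch of the sliding-window arithmetic that the new create_sliced_pdfs command applies per document: windows of slice_length pages advance by slice_length - num_overlap, and each window is written out as {base_name}_ps_{start}_pe_{end}.pdf. The helper name slice_windows and the seven-page example document are made up for illustration; only the range/step logic and the naming scheme mirror the command above.

# Illustrative sketch only: slice_windows is a hypothetical helper, not part of the CLI.
def slice_windows(total_pages: int, slice_length: int = 1, num_overlap: int = 0):
    """Yield (start_page, end_page) windows, stepping by slice_length - num_overlap."""
    step = slice_length - num_overlap
    for start_page in range(0, total_pages, step):
        yield start_page, min(start_page + slice_length, total_pages)


# A hypothetical 7-page "report.pdf" sliced with slice_length=3, num_overlap=1:
for start, end in slice_windows(total_pages=7, slice_length=3, num_overlap=1):
    print(f"report_ps_{start}_pe_{end}.pdf")
# report_ps_0_pe_3.pdf  report_ps_2_pe_5.pdf  report_ps_4_pe_7.pdf  report_ps_6_pe_7.pdf

Note that the final window can be shorter than slice_length when the page count is not a multiple of the step.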

docling_eval/datamodels/cvat_types.py

Lines changed: 2 additions & 0 deletions
@@ -148,6 +148,7 @@ def set_up_directory_structure(self, source: Path, target: Path) -> "BenchMarkDi

 class AnnotationBBox(BaseModel):

+    page_no: int
     bbox_id: int
     bbox: BoundingBox
     label: DocItemLabel
@@ -189,6 +190,7 @@ class AnnotatedImage(BaseModel):
     page_img_files: List[Path] = []
     bbox_annotations: List[AnnotationBBox] = []  # Renamed from pred_boxes
     line_annotations: List[AnnotationLine] = []  # Renamed from pred_lines
+    page_to_bbox: dict[int, BoundingBox] = {}

     def to_cvat(self, lines: bool = False) -> str:
         """
