From d34ef6befb06654eddfbff822a5c07a830a87f01 Mon Sep 17 00:00:00 2001 From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com> Date: Mon, 30 Mar 2026 11:58:26 +0200 Subject: [PATCH 1/4] :sparkles: add support for crop & split extractions --- .github/workflows/cron.yml | 2 +- mindee/extraction/common/image_extractor.py | 8 ++- mindee/mindee_http/mindee_api_v2.py | 4 +- mindee/mindee_http/workflow_endpoint.py | 4 +- mindee/v2/__init__.py | 4 ++ mindee/v2/file_operations/__init__.py | 4 ++ mindee/v2/file_operations/crop.py | 69 +++++++++++++++++++++ mindee/v2/file_operations/split.py | 46 ++++++++++++++ mindee/v2/product/crop/crop_box.py | 13 ++++ mindee/v2/product/crop/crop_response.py | 14 +++++ mindee/v2/product/split/split_range.py | 13 ++++ mindee/v2/product/split/split_response.py | 14 +++++ tests/v2/test_client.py | 2 +- 13 files changed, 190 insertions(+), 7 deletions(-) create mode 100644 mindee/v2/file_operations/__init__.py create mode 100644 mindee/v2/file_operations/crop.py create mode 100644 mindee/v2/file_operations/split.py diff --git a/.github/workflows/cron.yml b/.github/workflows/cron.yml index 3fb7ea53..42ce2bb8 100644 --- a/.github/workflows/cron.yml +++ b/.github/workflows/cron.yml @@ -9,5 +9,5 @@ jobs: uses: mindee/mindee-api-python/.github/workflows/_test-regressions.yml@main secrets: inherit test-code-samples: - uses: mindee/mindee-api-python/.github/workflows/_smoke_test.yml@main + uses: mindee/mindee-api-python/.github/workflows/_smoke-test.yml@main secrets: inherit diff --git a/mindee/extraction/common/image_extractor.py b/mindee/extraction/common/image_extractor.py index 5bae6d37..32aeef66 100644 --- a/mindee/extraction/common/image_extractor.py +++ b/mindee/extraction/common/image_extractor.py @@ -1,5 +1,5 @@ import io -from typing import BinaryIO, List +from typing import BinaryIO, List, Union import pypdfium2 as pdfium from PIL import Image @@ -7,7 +7,7 @@ from mindee.error.mindee_error import MindeeError from 
mindee.extraction.common.extracted_image import ExtractedImage from mindee.geometry.point import Point -from mindee.geometry.polygon import get_min_max_x, get_min_max_y +from mindee.geometry.polygon import Polygon, get_min_max_x, get_min_max_y from mindee.input.sources.bytes_input import BytesInput from mindee.input.sources.local_input_source import LocalInputSource @@ -114,7 +114,9 @@ def get_file_extension(file_format: str): def extract_multiple_images_from_source( - input_source: LocalInputSource, page_id: int, polygons: List[List[Point]] + input_source: LocalInputSource, + page_id: int, + polygons: Union[List[Polygon], List[List[Point]]], ) -> List[ExtractedImage]: """ Extracts elements from a page based on a list of bounding boxes. diff --git a/mindee/mindee_http/mindee_api_v2.py b/mindee/mindee_http/mindee_api_v2.py index 446582ad..968bb813 100644 --- a/mindee/mindee_http/mindee_api_v2.py +++ b/mindee/mindee_http/mindee_api_v2.py @@ -4,7 +4,9 @@ import requests from mindee.error.mindee_error import MindeeApiV2Error -from mindee.input import LocalInputSource, UrlInputSource, BaseParameters +from mindee.input.base_parameters import BaseParameters +from mindee.input.sources.local_input_source import LocalInputSource +from mindee.input.sources.url_input_source import UrlInputSource from mindee.logger import logger from mindee.mindee_http.base_settings import USER_AGENT from mindee.mindee_http.settings_mixin import SettingsMixin diff --git a/mindee/mindee_http/workflow_endpoint.py b/mindee/mindee_http/workflow_endpoint.py index 7cd4cd2b..ed437d1e 100644 --- a/mindee/mindee_http/workflow_endpoint.py +++ b/mindee/mindee_http/workflow_endpoint.py @@ -2,7 +2,9 @@ import requests -from mindee.input import LocalInputSource, UrlInputSource, WorkflowOptions +from mindee.input.sources.local_input_source import LocalInputSource +from mindee.input.sources.url_input_source import UrlInputSource +from mindee.input.workflow_options import WorkflowOptions from 
mindee.mindee_http.base_endpoint import BaseEndpoint from mindee.mindee_http.workflow_settings import WorkflowSettings diff --git a/mindee/v2/__init__.py b/mindee/v2/__init__.py index c97f7080..81f408a4 100644 --- a/mindee/v2/__init__.py +++ b/mindee/v2/__init__.py @@ -1,3 +1,5 @@ +from mindee.v2.file_operations.split import Split +from mindee.v2.file_operations.crop import Crop from mindee.v2.product.classification.classification_parameters import ( ClassificationParameters, ) @@ -14,10 +16,12 @@ __all__ = [ "ClassificationResponse", "ClassificationParameters", + "Crop", "CropResponse", "CropParameters", "OCRResponse", "OCRParameters", + "Split", "SplitResponse", "SplitParameters", ] diff --git a/mindee/v2/file_operations/__init__.py b/mindee/v2/file_operations/__init__.py new file mode 100644 index 00000000..674d3aec --- /dev/null +++ b/mindee/v2/file_operations/__init__.py @@ -0,0 +1,4 @@ +from mindee.v2.file_operations.crop import Crop +from mindee.v2.file_operations.split import Split + +__all__ = ["Crop", "Split"] diff --git a/mindee/v2/file_operations/crop.py b/mindee/v2/file_operations/crop.py new file mode 100644 index 00000000..f77c9d22 --- /dev/null +++ b/mindee/v2/file_operations/crop.py @@ -0,0 +1,69 @@ +from typing import List + +from mindee.error import MindeeError +from mindee.extraction import ExtractedImage, extract_multiple_images_from_source +from mindee.geometry import Polygon +from mindee.input.sources.local_input_source import LocalInputSource +from mindee.parsing.v2.field import FieldLocation +from mindee.v2.product.crop.crop_box import CropBox + + +class Crop: + """Crop operations for V2.""" + + @classmethod + def extract_single_crop( + cls, input_source: LocalInputSource, crop: FieldLocation + ) -> ExtractedImage: + """ + Extracts a single crop as an image from the document. + + :param input_source: Local Input Source to extract the crop from. + :param crop: Crop to extract. + :return: ExtractedImage. 
+ """ + + return extract_multiple_images_from_source( + input_source, crop.page, [crop.polygon] + )[0] + + @classmethod + def extract_crops( + cls, input_source: LocalInputSource, crops: List[CropBox] + ) -> List[ExtractedImage]: + """ + Extracts individual crops from a document. + + :param input_source: Local Input Source to extract crops from. + :param crops: List of crops. + :return: Individual extracted crops as a list of ExtractedImage. + """ + images: List[ExtractedImage] = [] + if not crops: + raise MindeeError("No possible candidates found for Crop extraction.") + polygons: List[List[Polygon]] = [[] for _ in range(input_source.page_count)] + for i, crop in enumerate(crops): + polygons[crop.location.page].append(crop.location.polygon) + for i, polygon in enumerate(polygons): + images.extend( + extract_multiple_images_from_source( + input_source, + i, + polygon, + ) + ) + return images + + @classmethod + def apply( + cls, + input_source: LocalInputSource, + crops: List[CropBox], + ) -> List[ExtractedImage]: + """Crop a document into multiple images. + + :param input_source: Input source to crop. + :param crops: List of crops. + """ + + return cls.extract_crops(input_source, crops) diff --git a/mindee/v2/file_operations/split.py b/mindee/v2/file_operations/split.py new file mode 100644 index 00000000..417883c4 --- /dev/null +++ b/mindee/v2/file_operations/split.py @@ -0,0 +1,46 @@ +from typing import List, Union + +from mindee.error import MindeeError +from mindee.extraction import ExtractedPdf, PdfExtractor +from mindee.input.sources.local_input_source import LocalInputSource +from mindee.v2.product.split.split_range import SplitRange + + +class Split: + """Split operations for V2.""" + + @classmethod + def extract_splits( + cls, + input_source: LocalInputSource, + splits: Union[List[SplitRange], List[List[int]]], + ) -> List[ExtractedPdf]: + """ + Extracts splits as complete PDFs from the document. 
+ + :param input_source: Input source to split. + :param splits: List of sub-lists of pages to keep. + :return: A list of extracted PDFs. + """ + pdf_extractor = PdfExtractor(input_source) + page_groups = [] + for split in splits: + if isinstance(split, SplitRange): + page_groups.append(split.page_range) + else: + page_groups.append(split) + if len(splits) < 1: + raise MindeeError("No indexes provided.") + return pdf_extractor.extract_sub_documents(page_groups) + + @classmethod + def apply( + cls, input_source: LocalInputSource, splits: List[SplitRange] + ) -> List[ExtractedPdf]: + """Split a document into multiple PDFs. + + :param input_source: Input source to split. + :param splits: List of splits. + """ + + return cls.extract_splits(input_source, splits) diff --git a/mindee/v2/product/crop/crop_box.py b/mindee/v2/product/crop/crop_box.py index 62a32840..191b5d88 100644 --- a/mindee/v2/product/crop/crop_box.py +++ b/mindee/v2/product/crop/crop_box.py @@ -1,3 +1,5 @@ +from mindee.extraction import ExtractedImage, extract_multiple_images_from_source +from mindee.input.sources.local_input_source import LocalInputSource from mindee.parsing.common.string_dict import StringDict from mindee.parsing.v2.field.field_location import FieldLocation @@ -16,3 +18,14 @@ def __init__(self, server_response: StringDict): def __str__(self) -> str: return f"* :Location: {self.location}\n :Object Type: {self.object_type}" + + def apply_to_file(self, input_source: LocalInputSource) -> ExtractedImage: + """ + Apply the crop box to a file and return a single extracted image. 
+ + :param input_source: Local file to apply the inference to + :return: Extracted image + """ + return extract_multiple_images_from_source( + input_source, self.location.page, [self.location.polygon] + )[0] diff --git a/mindee/v2/product/crop/crop_response.py b/mindee/v2/product/crop/crop_response.py index 03b4be0e..f60f0992 100644 --- a/mindee/v2/product/crop/crop_response.py +++ b/mindee/v2/product/crop/crop_response.py @@ -1,4 +1,9 @@ +from typing import List + +from mindee.extraction.common.extracted_image import ExtractedImage +from mindee.input.sources.local_input_source import LocalInputSource from mindee.parsing.common.string_dict import StringDict +from mindee.v2.file_operations.crop import Crop from mindee.v2.parsing.inference import BaseResponse from mindee.v2.product.crop.crop_inference import CropInference @@ -15,3 +20,12 @@ class CropResponse(BaseResponse): def __init__(self, raw_response: StringDict) -> None: super().__init__(raw_response) self.inference = CropInference(raw_response["inference"]) + + def apply_to_file(self, input_source: LocalInputSource) -> List[ExtractedImage]: + """ + Apply the crop inference to a file and return a list of extracted images. 
+ + :param input_source: Local file to apply the inference to + :return: List of extracted PDFs + """ + return Crop.extract_crops(input_source, self.inference.result.crops) diff --git a/mindee/v2/product/split/split_range.py b/mindee/v2/product/split/split_range.py index e0e70110..4167290e 100644 --- a/mindee/v2/product/split/split_range.py +++ b/mindee/v2/product/split/split_range.py @@ -1,5 +1,8 @@ from typing import List +from mindee.extraction.pdf_extractor.extracted_pdf import ExtractedPdf +from mindee.extraction.pdf_extractor.pdf_extractor import PdfExtractor +from mindee.input.sources.local_input_source import LocalInputSource from mindee.parsing.common.string_dict import StringDict @@ -21,3 +24,13 @@ def __init__(self, server_response: StringDict): def __str__(self) -> str: page_range = ",".join([str(page_index) for page_index in self.page_range]) return f"* :Page Range: {page_range}\n :Document Type: {self.document_type}" + + def apply_to_file(self, input_source: LocalInputSource) -> ExtractedPdf: + """ + Apply the split range inference to a file and return a single extracted PDF. 
+ + :param input_source: Local file to apply the inference to + :return: Extracted PDF + """ + pdf_extractor = PdfExtractor(input_source) + return pdf_extractor.extract_sub_documents([self.page_range])[0] diff --git a/mindee/v2/product/split/split_response.py b/mindee/v2/product/split/split_response.py index ec5112ef..4bfec65a 100644 --- a/mindee/v2/product/split/split_response.py +++ b/mindee/v2/product/split/split_response.py @@ -1,4 +1,9 @@ +from typing import List + +from mindee.extraction import ExtractedPdf +from mindee.input.sources.local_input_source import LocalInputSource from mindee.parsing.common.string_dict import StringDict +from mindee.v2.file_operations.split import Split from mindee.v2.parsing.inference import BaseResponse from mindee.v2.product.split.split_inference import SplitInference @@ -15,3 +20,12 @@ class SplitResponse(BaseResponse): def __init__(self, raw_response: StringDict) -> None: super().__init__(raw_response) self.inference = SplitInference(raw_response["inference"]) + + def apply_to_file(self, input_source: LocalInputSource) -> List[ExtractedPdf]: + """ + Apply the split inference to a file and return a list of extracted PDFs. 
+ + :param input_source: Local file to apply the inference to + :return: List of extracted PDFs + """ + return Split.extract_splits(input_source, self.inference.result.splits) diff --git a/tests/v2/test_client.py b/tests/v2/test_client.py index a4e87685..4423cdec 100644 --- a/tests/v2/test_client.py +++ b/tests/v2/test_client.py @@ -6,7 +6,7 @@ from mindee import ClientV2, InferenceParameters, InferenceResponse, LocalResponse from mindee.error.mindee_error import MindeeApiV2Error, MindeeError from mindee.error.mindee_http_error_v2 import MindeeHTTPErrorV2 -from mindee.input import LocalInputSource, PathInput +from mindee.input.sources.local_input_source import LocalInputSource, PathInput from mindee.mindee_http.base_settings import USER_AGENT from mindee.parsing.v2.inference import Inference from mindee.parsing.v2.job import Job From 9c393e631b75c7c5fbb0702e2c976c86cf16abda Mon Sep 17 00:00:00 2001 From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com> Date: Mon, 30 Mar 2026 14:10:00 +0200 Subject: [PATCH 2/4] add tests --- .../auto_multi_receipts_extraction_example.py | 4 +- mindee/extraction/common/extracted_image.py | 20 ++++-- .../extraction/pdf_extractor/extracted_pdf.py | 5 ++ mindee/v2/file_operations/split.py | 7 +- mindee/v2/product/crop/crop_response.py | 25 +++++++- tests/data | 2 +- tests/input/test_compression.py | 52 +++++++-------- tests/utils.py | 9 +++ tests/v1/extraction/test_image_extractor.py | 6 +- .../test_url_input_source_integration.py | 11 +--- tests/v2/file_operations/__init__.py | 0 .../v2/file_operations/test_crop_operation.py | 64 +++++++++++++++++++ .../test_crop_operation_integration.py | 62 ++++++++++++++++++ .../file_operations/test_split_operation.py | 56 ++++++++++++++++ .../test_split_operation_integration.py | 62 ++++++++++++++++++ tests/v2/product/crop/test_crop_response.py | 2 +- tests/v2/product/split/test_split_response.py | 4 +- tests/v2/test_client.py | 3 +- 18 files changed, 343 insertions(+), 51 
deletions(-) create mode 100644 tests/v2/file_operations/__init__.py create mode 100644 tests/v2/file_operations/test_crop_operation.py create mode 100644 tests/v2/file_operations/test_crop_operation_integration.py create mode 100644 tests/v2/file_operations/test_split_operation.py create mode 100644 tests/v2/file_operations/test_split_operation_integration.py diff --git a/examples/auto_multi_receipts_extraction_example.py b/examples/auto_multi_receipts_extraction_example.py index aa995f51..2906a155 100644 --- a/examples/auto_multi_receipts_extraction_example.py +++ b/examples/auto_multi_receipts_extraction_example.py @@ -16,7 +16,9 @@ def parse_receipts(input_path): extracted_receipts = extract_receipts(input_doc, result_split.document.inference) for idx, receipt in enumerate(extracted_receipts, 1): - result_receipt = mindee_client.parse(product.ReceiptV5, receipt.as_source()) + result_receipt = mindee_client.parse( + product.ReceiptV5, receipt.as_input_source() + ) print(f"Receipt {idx}:") print(result_receipt.document) print("-" * 40) diff --git a/mindee/extraction/common/extracted_image.py b/mindee/extraction/common/extracted_image.py index e4013246..8fbb57d7 100644 --- a/mindee/extraction/common/extracted_image.py +++ b/mindee/extraction/common/extracted_image.py @@ -17,6 +17,8 @@ class ExtractedImage: """Id of the page the image was extracted from.""" _element_id: int """Id of the element on a given page.""" + filename: str + """Name of the file the image was extracted from.""" def __init__( self, input_source: LocalInputSource, page_id: int, element_id: int @@ -30,6 +32,7 @@ def __init__( """ self.buffer = io.BytesIO(input_source.file_object.read()) self.buffer.name = input_source.filename + self.filename = input_source.filename if input_source.is_pdf(): extension = "jpg" else: @@ -56,20 +59,27 @@ def save_to_file(self, output_path: str, file_format: Optional[str] = None): if not file_format: if len(resolved_path.suffix) < 1: raise ValueError("Invalid file 
format.") - file_format = ( - resolved_path.suffix.upper() - ) # technically redundant since PIL applies an upper operation - # to the parameter , but older versions may not do so. + # Let PIL infer format from filename extension self.buffer.seek(0) image = Image.open(self.buffer) - image.save(resolved_path, format=file_format) + if file_format: + image.save(resolved_path, format=file_format) + else: + image.save(resolved_path) logger.info("File saved successfully to '%s'.", resolved_path) except TypeError as exc: raise MindeeError("Invalid path/filename provided.") from exc except Exception as exc: + print(exc) raise MindeeError(f"Could not save file {Path(output_path).name}.") from exc def as_source(self) -> FileInput: + """ + Deprecated. Use ``as_input_source`` instead. + """ + return self.as_input_source() + + def as_input_source(self) -> FileInput: """ Return the file as a Mindee-compatible BufferInput source. diff --git a/mindee/extraction/pdf_extractor/extracted_pdf.py b/mindee/extraction/pdf_extractor/extracted_pdf.py index 0e3dcb8d..eab00a1c 100644 --- a/mindee/extraction/pdf_extractor/extracted_pdf.py +++ b/mindee/extraction/pdf_extractor/extracted_pdf.py @@ -28,6 +28,10 @@ def get_page_count(self) -> int: ) from exc def write_to_file(self, output_path: str): + """Deprecated. Use ``save_to_file`` instead.""" + self.save_to_file(output_path) + + def save_to_file(self, output_path: str): """ Writes the contents of the current PDF object to a file. @@ -40,6 +44,7 @@ def write_to_file(self, output_path: str): raise MindeeError("Invalid save path provided {}.") if out_path.suffix.lower() != "pdf": out_path = out_path.parent / (out_path.stem + "." 
+ "pdf") + self.pdf_bytes.seek(0) with open(out_path, "wb") as out_file: out_file.write(self.pdf_bytes.read()) diff --git a/mindee/v2/file_operations/split.py b/mindee/v2/file_operations/split.py index 417883c4..469451e0 100644 --- a/mindee/v2/file_operations/split.py +++ b/mindee/v2/file_operations/split.py @@ -26,9 +26,12 @@ def extract_splits( page_groups = [] for split in splits: if isinstance(split, SplitRange): - page_groups.append(split.page_range) + lower_bound = split.page_range[0] + upper_bound = split.page_range[1] else: - page_groups.append(split) + lower_bound = split[0] + upper_bound = split[1] + page_groups.append(list(range(lower_bound, upper_bound + 1))) if len(splits) < 1: raise MindeeError("No indexes provided.") return pdf_extractor.extract_sub_documents(page_groups) diff --git a/mindee/v2/product/crop/crop_response.py b/mindee/v2/product/crop/crop_response.py index f60f0992..74094633 100644 --- a/mindee/v2/product/crop/crop_response.py +++ b/mindee/v2/product/crop/crop_response.py @@ -1,9 +1,11 @@ from typing import List +from mindee.error import MindeeError from mindee.extraction.common.extracted_image import ExtractedImage +from mindee.extraction.common.image_extractor import extract_multiple_images_from_source +from mindee.geometry import Polygon from mindee.input.sources.local_input_source import LocalInputSource from mindee.parsing.common.string_dict import StringDict -from mindee.v2.file_operations.crop import Crop from mindee.v2.parsing.inference import BaseResponse from mindee.v2.product.crop.crop_inference import CropInference @@ -28,4 +30,23 @@ def apply_to_file(self, input_source: LocalInputSource) -> List[ExtractedImage]: :param input_source: Local file to apply the inference to :return: List of extracted PDFs """ - return Crop.extract_crops(input_source, self.inference.result.crops) + crops = self.inference.result.crops + if not crops: + raise MindeeError("No possible candidates found for Crop extraction.") + + polygons: 
List[List[Polygon]] = [[] for _ in range(input_source.page_count)] + for crop in crops: + polygons[crop.location.page].append(crop.location.polygon) + + images: List[ExtractedImage] = [] + for page_index, page_polygons in enumerate(polygons): + if not page_polygons: + continue + images.extend( + extract_multiple_images_from_source( + input_source, + page_index, + page_polygons, + ) + ) + return images diff --git a/tests/data b/tests/data index c2e36f5b..53f0efbc 160000 --- a/tests/data +++ b/tests/data @@ -1 +1 @@ -Subproject commit c2e36f5b635386cb9bb922b517c4e02039b0a122 +Subproject commit 53f0efbc08c77c2c085aadd27de9d2d6c359276e diff --git a/tests/input/test_compression.py b/tests/input/test_compression.py index b3b7c767..fde5d064 100644 --- a/tests/input/test_compression.py +++ b/tests/input/test_compression.py @@ -9,9 +9,14 @@ from mindee.input.sources import PathInput from mindee.pdf.pdf_compressor import compress_pdf from mindee.pdf.pdf_utils import extract_text_from_pdf -from tests.utils import FILE_TYPES_DIR, ROOT_DATA_DIR, V1_DATA_DIR, V1_PRODUCT_DATA_DIR +from tests.utils import ( + FILE_TYPES_DIR, + OUTPUT_DIR, + V1_DATA_DIR, + V1_PRODUCT_DATA_DIR, + cleanup_output_files, +) -OUTPUT_DIR = ROOT_DATA_DIR / "output" RECEIPT_PATH = FILE_TYPES_DIR / "receipt.jpg" @@ -202,26 +207,23 @@ def test_pdf_compress_with_text_does_not_compress(): @pytest.fixture(scope="module", autouse=True) def cleanup(): yield - created_files = [ - "compress10.pdf", - "compress50.pdf", - "compress75.pdf", - "compress85.pdf", - "resize_indirect.pdf", - "compress1.jpg", - "compress10.jpg", - "compress50.jpg", - "compress75.jpg", - "compress100.jpg", - "compress_indirect.jpg", - "resize250x500.jpg", - "resize500x250.jpg", - "resize500xnull.jpg", - "resize_indirect.jpg", - "resizenullx250.jpg", - ] - - for file_path in created_files: - full_path = OUTPUT_DIR / file_path - if full_path.exists(): - os.remove(full_path) + cleanup_output_files( + [ + "compress10.pdf", + "compress50.pdf", + 
"compress75.pdf", + "compress85.pdf", + "resize_indirect.pdf", + "compress1.jpg", + "compress10.jpg", + "compress50.jpg", + "compress75.jpg", + "compress100.jpg", + "compress_indirect.jpg", + "resize250x500.jpg", + "resize500x250.jpg", + "resize500xnull.jpg", + "resize_indirect.jpg", + "resizenullx250.jpg", + ] + ) diff --git a/tests/utils.py b/tests/utils.py index 252a699c..79948522 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,3 +1,4 @@ +import os from difflib import SequenceMatcher from pathlib import Path @@ -9,6 +10,7 @@ ROOT_DATA_DIR = Path(__file__).parent / "data" FILE_TYPES_DIR = ROOT_DATA_DIR / "file_types" +OUTPUT_DIR = ROOT_DATA_DIR / "output" V1_DATA_DIR = ROOT_DATA_DIR / "v1" V1_ERROR_DATA_DIR = V1_DATA_DIR / "errors" @@ -44,3 +46,10 @@ def levenshtein_ratio(ref_str: str, target_str: str) -> float: :return: Ratio between the two strings """ return SequenceMatcher(None, ref_str, target_str).ratio() + + +def cleanup_output_files(created_files): + for file_path in created_files: + full_path = OUTPUT_DIR / file_path + if full_path.exists(): + os.remove(full_path) diff --git a/tests/v1/extraction/test_image_extractor.py b/tests/v1/extraction/test_image_extractor.py index 6416802c..87147ca3 100644 --- a/tests/v1/extraction/test_image_extractor.py +++ b/tests/v1/extraction/test_image_extractor.py @@ -35,9 +35,9 @@ def test_barcode_image_extraction(barcode_path, barcode_json_path): assert len(extracted_barcodes_1d) == 1 assert len(extracted_barcodes_2d) == 2 - assert extracted_barcodes_1d[0].as_source().filename.endswith("jpg") + assert extracted_barcodes_1d[0].as_input_source().filename.endswith("jpg") assert Image.open(extracted_barcodes_1d[0].buffer).size == (353, 200) assert Image.open(extracted_barcodes_2d[0].buffer).size == (214, 216) - assert extracted_barcodes_2d[0].as_source().filename.endswith("jpg") - assert extracted_barcodes_2d[1].as_source().filename.endswith("jpg") + assert 
extracted_barcodes_2d[0].as_input_source().filename.endswith("jpg") + assert extracted_barcodes_2d[1].as_input_source().filename.endswith("jpg") assert Image.open(extracted_barcodes_2d[1].buffer).size == (193, 201) diff --git a/tests/v1/input/test_url_input_source_integration.py b/tests/v1/input/test_url_input_source_integration.py index 6d6616bd..82c81b11 100644 --- a/tests/v1/input/test_url_input_source_integration.py +++ b/tests/v1/input/test_url_input_source_integration.py @@ -5,6 +5,7 @@ from mindee import Client from mindee.product.invoice import InvoiceV4 +from tests.utils import cleanup_output_files @pytest.fixture @@ -55,14 +56,8 @@ def test_save_file_with_filename(client, reference_file_path, output_file_path): @pytest.fixture(autouse=True) -def cleanup(request, output_file_path: Path): +def cleanup(request): def remove_test_files(): - generated_files = [ - Path.resolve(output_file_path / "invoice_5p.pdf"), - Path.resolve(output_file_path / "customFileName.pdf"), - ] - for filepath in generated_files: - if os.path.exists(filepath): - os.remove(filepath) + cleanup_output_files(["invoice_5p.pdf", "customFileName.pdf"]) request.addfinalizer(remove_test_files) diff --git a/tests/v2/file_operations/__init__.py b/tests/v2/file_operations/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/v2/file_operations/test_crop_operation.py b/tests/v2/file_operations/test_crop_operation.py new file mode 100644 index 00000000..85a12367 --- /dev/null +++ b/tests/v2/file_operations/test_crop_operation.py @@ -0,0 +1,64 @@ +import json + +import pytest +from PIL import Image + +from mindee.v2.file_operations.crop import Crop +from mindee.input.sources.path_input import PathInput +from mindee.v2.product.crop.crop_response import ( + CropResponse, +) +from tests.utils import V2_PRODUCT_DATA_DIR + + +@pytest.fixture +def crops_single_page_path(): + return V2_PRODUCT_DATA_DIR / "crop" / "default_sample.jpg" + + +@pytest.fixture +def 
crops_multi_page_path(): + return V2_PRODUCT_DATA_DIR / "crop" / "multipage_sample.pdf" + + +@pytest.fixture +def crops_single_page_json_path(): + return V2_PRODUCT_DATA_DIR / "crop" / "crop_single.json" + + +@pytest.fixture +def crops_multi_page_json_path(): + return V2_PRODUCT_DATA_DIR / "crop" / "crop_multiple.json" + + +def test_single_page_crop_split(crops_single_page_path, crops_single_page_json_path): + input_sample = PathInput(crops_single_page_path) + with open(crops_single_page_json_path, "rb") as f: + response = json.load(f) + doc = CropResponse(response) + extracted_crops = Crop.extract_crops(input_sample, doc.inference.result.crops) + assert len(extracted_crops) == 1 + + assert extracted_crops[0].page_id == 0 + assert extracted_crops[0].element_id == 0 + image_buffer_0 = Image.open(extracted_crops[0].buffer) + assert image_buffer_0.size == (2823, 1571) + + +def test_multi_page_receipt_split(crops_multi_page_path, crops_multi_page_json_path): + input_sample = PathInput(crops_multi_page_path) + with open(crops_multi_page_json_path, "rb") as f: + response = json.load(f) + doc = CropResponse(response) + extracted_crops = Crop.extract_crops(input_sample, doc.inference.result.crops) + assert len(extracted_crops) == 2 + + assert extracted_crops[0].page_id == 0 + assert extracted_crops[0].element_id == 0 + image_buffer_0 = Image.open(extracted_crops[0].buffer) + assert image_buffer_0.size == (156, 758) + + assert extracted_crops[1].page_id == 0 + assert extracted_crops[1].element_id == 1 + image_buffer_1 = Image.open(extracted_crops[1].buffer) + assert image_buffer_1.size == (187, 690) diff --git a/tests/v2/file_operations/test_crop_operation_integration.py b/tests/v2/file_operations/test_crop_operation_integration.py new file mode 100644 index 00000000..3a5b8dbe --- /dev/null +++ b/tests/v2/file_operations/test_crop_operation_integration.py @@ -0,0 +1,62 @@ +import os +from os import getenv + +import pytest + +from mindee import ( + ClientV2, + 
InferenceParameters, + InferenceResponse, + CropParameters, + CropResponse, +) +from mindee.input.sources.path_input import PathInput +from mindee.v2 import Crop +from tests.utils import OUTPUT_DIR, V2_PRODUCT_DATA_DIR, cleanup_output_files + + +@pytest.fixture +def crop_sample(): + return V2_PRODUCT_DATA_DIR / "crop" / "default_sample.jpg" + + +def check_findoc_return(findoc_response: InferenceResponse): + assert len(findoc_response.inference.model.id) > 0 + assert findoc_response.inference.result.fields.get("total_amount").value > 0 + + +@pytest.mark.integration +def test_image_should_extract_crops(): + client = ClientV2() + crop_input = PathInput(V2_PRODUCT_DATA_DIR / "crop" / "default_sample.jpg") + response = client.enqueue_and_get_result( + CropResponse, + crop_input, + CropParameters(getenv("MINDEE_V2_SE_TESTS_CROP_MODEL_ID"), close_file=False), + ) + assert len(response.inference.result.crops) == 2 + + extracted_images = Crop.extract_crops(crop_input, response.inference.result.crops) + + assert len(extracted_images) == 2 + assert extracted_images[0].filename == "default_sample.jpg_page1-0.jpg" + assert extracted_images[1].filename == "default_sample.jpg_page1-1.jpg" + + invoice_0 = client.enqueue_and_get_result( + InferenceResponse, + extracted_images[0].as_input_source(), + InferenceParameters( + getenv("MINDEE_V2_SE_TESTS_FINDOC_MODEL_ID"), close_file=False + ), + ) + check_findoc_return(invoice_0) + for i, extracted_image in enumerate(extracted_images): + extracted_image.save_to_file(OUTPUT_DIR / f"crop_{i + 1:03d}.jpg") + assert os.path.getsize(OUTPUT_DIR / "crop_001.jpg") == 198887 + assert os.path.getsize(OUTPUT_DIR / "crop_002.jpg") == 197443 + + +@pytest.fixture(scope="module", autouse=True) +def cleanup(): + yield + cleanup_output_files(["crop_001.jpg", "crop_002.jpg"]) diff --git a/tests/v2/file_operations/test_split_operation.py b/tests/v2/file_operations/test_split_operation.py new file mode 100644 index 00000000..dc873f4f --- /dev/null +++ 
b/tests/v2/file_operations/test_split_operation.py @@ -0,0 +1,56 @@ +import json + +import pytest + +from mindee.v2.file_operations.split import Split +from mindee.input.sources.path_input import PathInput +from mindee.v2.product.split.split_response import ( + SplitResponse, +) +from tests.utils import V2_PRODUCT_DATA_DIR + + +@pytest.fixture +def splits_default(): + return ( + V2_PRODUCT_DATA_DIR / "extraction" / "financial_document" / "default_sample.jpg" + ) + + +@pytest.fixture +def splits_5p(): + return V2_PRODUCT_DATA_DIR / "split" / "invoice_5p.pdf" + + +@pytest.fixture +def splits_single_page_json_path(): + return V2_PRODUCT_DATA_DIR / "split" / "split_single.json" + + +@pytest.fixture +def splits_multi_page_json_path(): + return V2_PRODUCT_DATA_DIR / "split" / "split_multiple.json" + + +def test_single_page_split_split(splits_default, splits_single_page_json_path): + input_sample = PathInput(splits_default) + with open(splits_single_page_json_path, "rb") as f: + response = json.load(f) + doc = SplitResponse(response) + extracted_splits = Split.extract_splits(input_sample, doc.inference.result.splits) + assert len(extracted_splits) == 1 + + assert extracted_splits[0].get_page_count() == 1 + + +def test_multi_page_receipt_split(splits_5p, splits_multi_page_json_path): + input_sample = PathInput(splits_5p) + with open(splits_multi_page_json_path, "rb") as f: + response = json.load(f) + doc = SplitResponse(response) + extracted_splits = Split.extract_splits(input_sample, doc.inference.result.splits) + assert len(extracted_splits) == 3 + + assert extracted_splits[0].get_page_count() == 1 + assert extracted_splits[1].get_page_count() == 3 + assert extracted_splits[2].get_page_count() == 1 diff --git a/tests/v2/file_operations/test_split_operation_integration.py b/tests/v2/file_operations/test_split_operation_integration.py new file mode 100644 index 00000000..d3cbce89 --- /dev/null +++ b/tests/v2/file_operations/test_split_operation_integration.py @@ -0,0 +1,62 
@@ +from os import getenv + +import pytest + +from mindee import ( + ClientV2, + InferenceParameters, + InferenceResponse, + SplitParameters, + SplitResponse, +) +from mindee.input.sources.path_input import PathInput +from mindee.v2 import Split +from tests.utils import OUTPUT_DIR, V2_PRODUCT_DATA_DIR, cleanup_output_files + + +@pytest.fixture +def invoice_splitter_5p_path(): + return V2_PRODUCT_DATA_DIR / "split" / "invoice_5p.pdf" + + +def check_findoc_return(findoc_response: InferenceResponse): + assert len(findoc_response.inference.model.id) > 0 + assert findoc_response.inference.result.fields.get("total_amount").value > 0 + + +@pytest.mark.integration +def test_pdf_should_extract_splits(): + client = ClientV2() + split_input = PathInput(V2_PRODUCT_DATA_DIR / "split" / "default_sample.pdf") + response = client.enqueue_and_get_result( + SplitResponse, + split_input, + SplitParameters(getenv("MINDEE_V2_SE_TESTS_SPLIT_MODEL_ID"), close_file=False), + ) + assert response.inference.file.page_count == 2 + + extracted_pdfs = Split.extract_splits(split_input, response.inference.result.splits) + + assert len(extracted_pdfs) == 2 + assert extracted_pdfs[0].filename == "default_sample_001-001.pdf" + assert extracted_pdfs[1].filename == "default_sample_002-002.pdf" + + invoice_0 = client.enqueue_and_get_result( + InferenceResponse, + extracted_pdfs[0].as_input_source(), + InferenceParameters( + getenv("MINDEE_V2_SE_TESTS_FINDOC_MODEL_ID"), close_file=False + ), + ) + check_findoc_return(invoice_0) + for i, extracted_pdf in enumerate(extracted_pdfs): + extracted_pdf.save_to_file(OUTPUT_DIR / f"split_{i + 1:03d}.pdf") + for i in range(len(extracted_pdfs)): + local_input = PathInput(OUTPUT_DIR / f"split_{i + 1:03d}.pdf") + assert local_input.page_count == extracted_pdfs[i].get_page_count() + + +@pytest.fixture(scope="module", autouse=True) +def cleanup(): + yield + cleanup_output_files(["split_001.pdf", "split_002.pdf"]) diff --git 
a/tests/v2/product/crop/test_crop_response.py b/tests/v2/product/crop/test_crop_response.py index 635a89ea..98832745 100644 --- a/tests/v2/product/crop/test_crop_response.py +++ b/tests/v2/product/crop/test_crop_response.py @@ -64,6 +64,6 @@ def test_crop_multiple(): assert response.inference.result.crops[1].location.polygon[3][0] == 0.547 assert response.inference.result.crops[1].location.polygon[3][1] == 0.97 assert response.inference.result.crops[1].location.page == 0 - assert response.inference.result.crops[1].object_type == "invoice" + assert response.inference.result.crops[1].object_type == "receipt" assert rst_sample == str(response) diff --git a/tests/v2/product/split/test_split_response.py b/tests/v2/product/split/test_split_response.py index 4ce2ad8b..29381469 100644 --- a/tests/v2/product/split/test_split_response.py +++ b/tests/v2/product/split/test_split_response.py @@ -32,7 +32,7 @@ def test_split_multiple(): assert len(response.inference.result.splits[0].page_range) == 2 assert response.inference.result.splits[0].page_range[0] == 0 assert response.inference.result.splits[0].page_range[1] == 0 - assert response.inference.result.splits[0].document_type == "invoice" + assert response.inference.result.splits[0].document_type == "passport" assert len(response.inference.result.splits[1].page_range) == 2 assert response.inference.result.splits[1].page_range[0] == 1 @@ -42,4 +42,4 @@ def test_split_multiple(): assert len(response.inference.result.splits[2].page_range) == 2 assert response.inference.result.splits[2].page_range[0] == 4 assert response.inference.result.splits[2].page_range[1] == 4 - assert response.inference.result.splits[2].document_type == "invoice" + assert response.inference.result.splits[2].document_type == "receipt" diff --git a/tests/v2/test_client.py b/tests/v2/test_client.py index 4423cdec..5eb0d3fa 100644 --- a/tests/v2/test_client.py +++ b/tests/v2/test_client.py @@ -6,7 +6,8 @@ from mindee import ClientV2, InferenceParameters, 
InferenceResponse, LocalResponse from mindee.error.mindee_error import MindeeApiV2Error, MindeeError from mindee.error.mindee_http_error_v2 import MindeeHTTPErrorV2 -from mindee.input.sources.local_input_source import LocalInputSource, PathInput +from mindee.input.sources.local_input_source import LocalInputSource +from mindee.input.sources.path_input import PathInput from mindee.mindee_http.base_settings import USER_AGENT from mindee.parsing.v2.inference import Inference from mindee.parsing.v2.job import Job From cdfd59d28551504a9fa7c5c5b5d6c54e0a002d7e Mon Sep 17 00:00:00 2001 From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com> Date: Mon, 30 Mar 2026 14:25:07 +0200 Subject: [PATCH 3/4] fix test for windows --- .../test_split_operation_integration.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/v2/file_operations/test_split_operation_integration.py b/tests/v2/file_operations/test_split_operation_integration.py index d3cbce89..e37a83e7 100644 --- a/tests/v2/file_operations/test_split_operation_integration.py +++ b/tests/v2/file_operations/test_split_operation_integration.py @@ -31,7 +31,10 @@ def test_pdf_should_extract_splits(): response = client.enqueue_and_get_result( SplitResponse, split_input, - SplitParameters(getenv("MINDEE_V2_SE_TESTS_SPLIT_MODEL_ID"), close_file=False), + SplitParameters( + getenv("MINDEE_V2_SE_TESTS_SPLIT_MODEL_ID"), + close_file=False, + ), ) assert response.inference.file.page_count == 2 @@ -53,7 +56,11 @@ def test_pdf_should_extract_splits(): extracted_pdf.save_to_file(OUTPUT_DIR / f"split_{i + 1:03d}.pdf") for i in range(len(extracted_pdfs)): local_input = PathInput(OUTPUT_DIR / f"split_{i + 1:03d}.pdf") - assert local_input.page_count == extracted_pdfs[i].get_page_count() + try: + assert local_input.page_count == extracted_pdfs[i].get_page_count() + finally: + local_input.close() + split_input.close() @pytest.fixture(scope="module", autouse=True) From 
9ba47a9483b56844906b85ccb50d98f345248ee8 Mon Sep 17 00:00:00 2001 From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com> Date: Mon, 30 Mar 2026 15:42:52 +0200 Subject: [PATCH 4/4] apply fixes --- mindee/extraction/common/extracted_image.py | 6 +- mindee/extraction/common/image_extractor.py | 2 +- .../extraction/pdf_extractor/extracted_pdf.py | 4 +- mindee/v2/__init__.py | 13 ++- mindee/v2/file_operations/__init__.py | 9 +- mindee/v2/file_operations/crop.py | 102 ++++++++---------- mindee/v2/file_operations/crop_files.py | 20 ++++ mindee/v2/file_operations/split.py | 68 +++++------- mindee/v2/file_operations/split_files.py | 20 ++++ mindee/v2/product/crop/crop_box.py | 2 +- mindee/v2/product/crop/crop_response.py | 35 ++---- mindee/v2/product/split/split_range.py | 2 +- mindee/v2/product/split/split_response.py | 14 +-- .../v2/file_operations/test_crop_operation.py | 6 +- .../test_crop_operation_integration.py | 7 +- .../file_operations/test_split_operation.py | 6 +- .../test_split_operation_integration.py | 7 +- 17 files changed, 160 insertions(+), 163 deletions(-) create mode 100644 mindee/v2/file_operations/crop_files.py create mode 100644 mindee/v2/file_operations/split_files.py diff --git a/mindee/extraction/common/extracted_image.py b/mindee/extraction/common/extracted_image.py index 8fbb57d7..66f7631a 100644 --- a/mindee/extraction/common/extracted_image.py +++ b/mindee/extraction/common/extracted_image.py @@ -1,6 +1,6 @@ import io from pathlib import Path -from typing import Optional +from typing import Optional, Union from PIL import Image @@ -46,7 +46,9 @@ def __init__( self._page_id = page_id self._element_id = 0 if element_id is None else element_id - def save_to_file(self, output_path: str, file_format: Optional[str] = None): + def save_to_file( + self, output_path: Union[Path, str], file_format: Optional[str] = None + ): """ Saves the document to a file. 
diff --git a/mindee/extraction/common/image_extractor.py b/mindee/extraction/common/image_extractor.py index 32aeef66..52c57d0d 100644 --- a/mindee/extraction/common/image_extractor.py +++ b/mindee/extraction/common/image_extractor.py @@ -116,7 +116,7 @@ def get_file_extension(file_format: str): def extract_multiple_images_from_source( input_source: LocalInputSource, page_id: int, - polygons: Union[List[Polygon], List[List[Point]]], + polygons: List[Union[Polygon, List[Point]]], ) -> List[ExtractedImage]: """ Extracts elements from a page based on a list of bounding boxes. diff --git a/mindee/extraction/pdf_extractor/extracted_pdf.py b/mindee/extraction/pdf_extractor/extracted_pdf.py index eab00a1c..f58e75e9 100644 --- a/mindee/extraction/pdf_extractor/extracted_pdf.py +++ b/mindee/extraction/pdf_extractor/extracted_pdf.py @@ -1,5 +1,5 @@ from pathlib import Path -from typing import BinaryIO +from typing import BinaryIO, Union import pypdfium2 as pdfium @@ -31,7 +31,7 @@ def write_to_file(self, output_path: str): """Deprecated. Use ``save_to_file`` instead.""" self.save_to_file(output_path) - def save_to_file(self, output_path: str): + def save_to_file(self, output_path: Union[Path, str]): """ Writes the contents of the current PDF object to a file. 
diff --git a/mindee/v2/__init__.py b/mindee/v2/__init__.py index 81f408a4..5a8d973e 100644 --- a/mindee/v2/__init__.py +++ b/mindee/v2/__init__.py @@ -1,5 +1,8 @@ -from mindee.v2.file_operations.split import Split -from mindee.v2.file_operations.crop import Crop +from mindee.v2.file_operations.crop import ( + extract_crops, + extract_single_crop, +) +from mindee.v2.file_operations.split import extract_splits from mindee.v2.product.classification.classification_parameters import ( ClassificationParameters, ) @@ -14,14 +17,16 @@ from mindee.v2.product.split.split_response import SplitResponse __all__ = [ + "extract_crops", + "extract_splits", + "extract_crops", + "extract_single_crop", "ClassificationResponse", "ClassificationParameters", - "Crop", "CropResponse", "CropParameters", "OCRResponse", "OCRParameters", - "Split", "SplitResponse", "SplitParameters", ] diff --git a/mindee/v2/file_operations/__init__.py b/mindee/v2/file_operations/__init__.py index 674d3aec..d9065e40 100644 --- a/mindee/v2/file_operations/__init__.py +++ b/mindee/v2/file_operations/__init__.py @@ -1,4 +1,7 @@ -from mindee.v2.file_operations.crop import Crop -from mindee.v2.file_operations.split import Split +from mindee.v2.file_operations.crop import ( + extract_crops, + extract_single_crop, +) +from mindee.v2.file_operations.split import extract_splits -__all__ = ["Crop", "Split"] +__all__ = ["extract_crops", "extract_splits", "extract_crops", "extract_single_crop"] diff --git a/mindee/v2/file_operations/crop.py b/mindee/v2/file_operations/crop.py index f77c9d22..b2d416df 100644 --- a/mindee/v2/file_operations/crop.py +++ b/mindee/v2/file_operations/crop.py @@ -1,69 +1,51 @@ -from typing import List +from typing import List, Union from mindee.error import MindeeError from mindee.extraction import ExtractedImage, extract_multiple_images_from_source -from mindee.geometry import Polygon +from mindee.geometry import Point, Polygon from mindee.input.sources.local_input_source import 
LocalInputSource from mindee.parsing.v2.field import FieldLocation +from mindee.v2.file_operations.crop_files import CropFiles from mindee.v2.product.crop.crop_box import CropBox -class Crop: - """Crop operations for V2.""" - - @classmethod - def extract_single_crop( - cls, input_source: LocalInputSource, crop: FieldLocation - ) -> ExtractedImage: - """ - Extracts a single crop as complete PDFs from the document. - - :param input_source: Local Input Source to extract sub-receipts from. - :param crop: Crop to extract. - :return: ExtractedImage. - """ - - return extract_multiple_images_from_source( - input_source, crop.page, [crop.polygon] - )[0] - - @classmethod - def extract_crops( - cls, input_source: LocalInputSource, crops: List[CropBox] - ) -> List[ExtractedImage]: - """ - Extracts individual receipts from multi-receipts documents. - - :param input_source: Local Input Source to extract sub-receipts from. - :param crops: List of crops. - :return: Individual extracted receipts as an array of ExtractedImage. - """ - images: List[ExtractedImage] = [] - if not crops: - raise MindeeError("No possible candidates found for Crop extraction.") - polygons: List[List[Polygon]] = [[] for _ in range(input_source.page_count)] - for i, crop in enumerate(crops): - polygons[crop.location.page].append(crop.location.polygon) - for i, polygon in enumerate(polygons): - images.extend( - extract_multiple_images_from_source( - input_source, - i, - polygon, - ) +def extract_single_crop( + input_source: LocalInputSource, crop: FieldLocation +) -> ExtractedImage: + """ + Extracts a single crop as an image from the document. + + :param input_source: Local Input Source to extract sub-receipts from. + :param crop: Crop to extract. + :return: ExtractedImage.
+ """ + + polygons: List[Union[Polygon, List[Point]]] = [crop.polygon] + return extract_multiple_images_from_source(input_source, crop.page, polygons)[0] + + +def extract_crops(input_source: LocalInputSource, crops: List[CropBox]) -> CropFiles: + """ + Extracts individual receipts from multi-receipts documents. + + :param input_source: Local Input Source to extract sub-receipts from. + :param crops: List of crops. + :return: Individual extracted receipts as an array of ExtractedImage. + """ + images: List[ExtractedImage] = [] + if not crops: + raise MindeeError("No possible candidates found for Crop extraction.") + polygons: List[List[Union[Polygon, List[Point]]]] = [ + [] for _ in range(input_source.page_count) + ] + for i, crop in enumerate(crops): + polygons[crop.location.page].append(crop.location.polygon) + for i, polygon in enumerate(polygons): + images.extend( + extract_multiple_images_from_source( + input_source, + i, + polygon, ) - return images - - @classmethod - def apply( - cls, - input_source: LocalInputSource, - crops: List[CropBox], - ) -> List[ExtractedImage]: - """Crop a document into multiple pages. - - :param input_source: Input source to crop. - :param crops: List of crops. - """ - - return cls.extract_crops(input_source, crops) + ) + return CropFiles(images) diff --git a/mindee/v2/file_operations/crop_files.py b/mindee/v2/file_operations/crop_files.py new file mode 100644 index 00000000..9887b669 --- /dev/null +++ b/mindee/v2/file_operations/crop_files.py @@ -0,0 +1,20 @@ +from pathlib import Path +from typing import List, Union + +from mindee.extraction import ExtractedImage + + +class CropFiles(List[ExtractedImage]): + """Crop files.""" + + def save_all_to_disk(self, path: Union[Path, str]): + """ + Save all extracted crops to disk. 
+ + :param path: Path to save the extracted crops to + """ + if isinstance(path, str): + path = Path(path) + path.mkdir(parents=True, exist_ok=True) + for idx, split in enumerate(self, start=1): + split.save_to_file(path / f"crop_{idx:03}.jpg") diff --git a/mindee/v2/file_operations/split.py b/mindee/v2/file_operations/split.py index 469451e0..e528e7b4 100644 --- a/mindee/v2/file_operations/split.py +++ b/mindee/v2/file_operations/split.py @@ -1,49 +1,33 @@ from typing import List, Union from mindee.error import MindeeError -from mindee.extraction import ExtractedPdf, PdfExtractor +from mindee.extraction import PdfExtractor from mindee.input.sources.local_input_source import LocalInputSource +from mindee.v2.file_operations.split_files import SplitFiles from mindee.v2.product.split.split_range import SplitRange -class Split: - """Split operations for V2.""" - - @classmethod - def extract_splits( - cls, - input_source: LocalInputSource, - splits: Union[List[SplitRange], List[List[int]]], - ) -> List[ExtractedPdf]: - """ - Extracts splits as complete PDFs from the document. - - :param input_source: Input source to split. - :param splits: List of sub-lists of pages to keep. - :return: A list of extracted invoices. - """ - pdf_extractor = PdfExtractor(input_source) - page_groups = [] - for split in splits: - if isinstance(split, SplitRange): - lower_bound = split.page_range[0] - upper_bound = split.page_range[1] - else: - lower_bound = split[0] - upper_bound = split[1] - page_groups.append(list(range(lower_bound, upper_bound + 1))) - if len(splits) < 1: - raise MindeeError("No indexes provided.") - return pdf_extractor.extract_sub_documents(page_groups) - - @classmethod - def apply( - cls, input_source: LocalInputSource, splits: List[SplitRange] - ) -> List[ExtractedPdf]: - """Split a document into multiple pages. - - :param input_source: Input source to split. - :param splits: List of splits.
- """ - - return cls.extract_splits(input_source, splits) +def extract_splits( + input_source: LocalInputSource, + splits: Union[List[SplitRange], List[List[int]]], +) -> SplitFiles: + """ + Extracts splits as complete PDFs from the document. + + :param input_source: Input source to split. + :param splits: List of sub-lists of pages to keep. + :return: A list of extracted invoices. + """ + pdf_extractor = PdfExtractor(input_source) + page_groups = [] + for split in splits: + if isinstance(split, SplitRange): + lower_bound = split.page_range[0] + upper_bound = split.page_range[1] + else: + lower_bound = split[0] + upper_bound = split[1] + page_groups.append(list(range(lower_bound, upper_bound + 1))) + if len(splits) < 1: + raise MindeeError("No indexes provided.") + return SplitFiles(pdf_extractor.extract_sub_documents(page_groups)) diff --git a/mindee/v2/file_operations/split_files.py b/mindee/v2/file_operations/split_files.py new file mode 100644 index 00000000..8368ecb2 --- /dev/null +++ b/mindee/v2/file_operations/split_files.py @@ -0,0 +1,20 @@ +from pathlib import Path +from typing import List, Union + +from mindee.extraction.pdf_extractor.extracted_pdf import ExtractedPdf + + +class SplitFiles(List[ExtractedPdf]): + """Split files.""" + + def save_all_to_disk(self, path: Union[str, Path]): + """ + Save all extracted splits to disk. 
+ + :param path: Path to save the extracted splits to + """ + if isinstance(path, str): + path = Path(path) + path.mkdir(parents=True, exist_ok=True) + for idx, split in enumerate(self, start=1): + split.save_to_file(path / f"split_{idx:03}.pdf") diff --git a/mindee/v2/product/crop/crop_box.py b/mindee/v2/product/crop/crop_box.py index 191b5d88..88cea161 100644 --- a/mindee/v2/product/crop/crop_box.py +++ b/mindee/v2/product/crop/crop_box.py @@ -19,7 +19,7 @@ def __init__(self, server_response: StringDict): def __str__(self) -> str: return f"* :Location: {self.location}\n :Object Type: {self.object_type}" - def apply_to_file(self, input_source: LocalInputSource) -> ExtractedImage: + def extract_from_file(self, input_source: LocalInputSource) -> ExtractedImage: """ Apply the split range inference to a file and return a single extracted PDF. diff --git a/mindee/v2/product/crop/crop_response.py b/mindee/v2/product/crop/crop_response.py index 74094633..3d2c35b0 100644 --- a/mindee/v2/product/crop/crop_response.py +++ b/mindee/v2/product/crop/crop_response.py @@ -1,11 +1,6 @@ -from typing import List - -from mindee.error import MindeeError -from mindee.extraction.common.extracted_image import ExtractedImage -from mindee.extraction.common.image_extractor import extract_multiple_images_from_source -from mindee.geometry import Polygon from mindee.input.sources.local_input_source import LocalInputSource from mindee.parsing.common.string_dict import StringDict +from mindee.v2.file_operations.crop_files import CropFiles from mindee.v2.parsing.inference import BaseResponse from mindee.v2.product.crop.crop_inference import CropInference @@ -23,30 +18,16 @@ def __init__(self, raw_response: StringDict) -> None: super().__init__(raw_response) self.inference = CropInference(raw_response["inference"]) - def apply_to_file(self, input_source: LocalInputSource) -> List[ExtractedImage]: + def extract_from_file(self, input_source: LocalInputSource) -> CropFiles: """ Apply the crop 
inference to a file and return a list of extracted images. :param input_source: Local file to apply the inference to :return: List of extracted PDFs """ - crops = self.inference.result.crops - if not crops: - raise MindeeError("No possible candidates found for Crop extraction.") - - polygons: List[List[Polygon]] = [[] for _ in range(input_source.page_count)] - for crop in crops: - polygons[crop.location.page].append(crop.location.polygon) - - images: List[ExtractedImage] = [] - for page_index, page_polygons in enumerate(polygons): - if not page_polygons: - continue - images.extend( - extract_multiple_images_from_source( - input_source, - page_index, - page_polygons, - ) - ) - return images + return CropFiles( + [ + crop.extract_from_file(input_source) + for crop in self.inference.result.crops + ] + ) diff --git a/mindee/v2/product/split/split_range.py b/mindee/v2/product/split/split_range.py index 4167290e..9888a930 100644 --- a/mindee/v2/product/split/split_range.py +++ b/mindee/v2/product/split/split_range.py @@ -25,7 +25,7 @@ def __str__(self) -> str: page_range = ",".join([str(page_index) for page_index in self.page_range]) return f"* :Page Range: {page_range}\n :Document Type: {self.document_type}" - def apply_to_file(self, input_source: LocalInputSource) -> ExtractedPdf: + def extract_from_file(self, input_source: LocalInputSource) -> ExtractedPdf: """ Apply the split range inference to a file and return a single extracted PDF. 
diff --git a/mindee/v2/product/split/split_response.py b/mindee/v2/product/split/split_response.py index 4bfec65a..b4770902 100644 --- a/mindee/v2/product/split/split_response.py +++ b/mindee/v2/product/split/split_response.py @@ -1,9 +1,6 @@ -from typing import List - -from mindee.extraction import ExtractedPdf from mindee.input.sources.local_input_source import LocalInputSource from mindee.parsing.common.string_dict import StringDict -from mindee.v2.file_operations.split import Split +from mindee.v2.file_operations.split_files import SplitFiles from mindee.v2.parsing.inference import BaseResponse from mindee.v2.product.split.split_inference import SplitInference @@ -21,11 +18,16 @@ def __init__(self, raw_response: StringDict) -> None: super().__init__(raw_response) self.inference = SplitInference(raw_response["inference"]) - def apply_to_file(self, input_source: LocalInputSource) -> List[ExtractedPdf]: + def extract_from_file(self, input_source: LocalInputSource) -> SplitFiles: """ Apply the split inference to a file and return a list of extracted PDFs. 
:param input_source: Local file to apply the inference to :return: List of extracted PDFs """ - return Split.extract_splits(input_source, self.inference.result.splits) + return SplitFiles( + [ + split.extract_from_file(input_source) + for split in self.inference.result.splits + ] + ) diff --git a/tests/v2/file_operations/test_crop_operation.py b/tests/v2/file_operations/test_crop_operation.py index 85a12367..24fb928c 100644 --- a/tests/v2/file_operations/test_crop_operation.py +++ b/tests/v2/file_operations/test_crop_operation.py @@ -3,7 +3,7 @@ import pytest from PIL import Image -from mindee.v2.file_operations.crop import Crop +from mindee.v2.file_operations.crop import extract_crops from mindee.input.sources.path_input import PathInput from mindee.v2.product.crop.crop_response import ( CropResponse, @@ -36,7 +36,7 @@ def test_single_page_crop_split(crops_single_page_path, crops_single_page_json_p with open(crops_single_page_json_path, "rb") as f: response = json.load(f) doc = CropResponse(response) - extracted_crops = Crop.extract_crops(input_sample, doc.inference.result.crops) + extracted_crops = extract_crops(input_sample, doc.inference.result.crops) assert len(extracted_crops) == 1 assert extracted_crops[0].page_id == 0 @@ -50,7 +50,7 @@ def test_multi_page_receipt_split(crops_multi_page_path, crops_multi_page_json_p with open(crops_multi_page_json_path, "rb") as f: response = json.load(f) doc = CropResponse(response) - extracted_crops = Crop.extract_crops(input_sample, doc.inference.result.crops) + extracted_crops = extract_crops(input_sample, doc.inference.result.crops) assert len(extracted_crops) == 2 assert extracted_crops[0].page_id == 0 diff --git a/tests/v2/file_operations/test_crop_operation_integration.py b/tests/v2/file_operations/test_crop_operation_integration.py index 3a5b8dbe..a7541a32 100644 --- a/tests/v2/file_operations/test_crop_operation_integration.py +++ b/tests/v2/file_operations/test_crop_operation_integration.py @@ -11,7 +11,7 @@ 
CropResponse, ) from mindee.input.sources.path_input import PathInput -from mindee.v2 import Crop +from mindee.v2.file_operations.crop import extract_crops from tests.utils import OUTPUT_DIR, V2_PRODUCT_DATA_DIR, cleanup_output_files @@ -36,7 +36,7 @@ def test_image_should_extract_crops(): ) assert len(response.inference.result.crops) == 2 - extracted_images = Crop.extract_crops(crop_input, response.inference.result.crops) + extracted_images = extract_crops(crop_input, response.inference.result.crops) assert len(extracted_images) == 2 assert extracted_images[0].filename == "default_sample.jpg_page1-0.jpg" @@ -50,8 +50,7 @@ def test_image_should_extract_crops(): ), ) check_findoc_return(invoice_0) - for i, extracted_image in enumerate(extracted_images): - extracted_image.save_to_file(OUTPUT_DIR / f"crop_{i + 1:03d}.jpg") + extracted_images.save_all_to_disk(OUTPUT_DIR) assert os.path.getsize(OUTPUT_DIR / "crop_001.jpg") == 198887 assert os.path.getsize(OUTPUT_DIR / "crop_002.jpg") == 197443 diff --git a/tests/v2/file_operations/test_split_operation.py b/tests/v2/file_operations/test_split_operation.py index dc873f4f..8e70b5ac 100644 --- a/tests/v2/file_operations/test_split_operation.py +++ b/tests/v2/file_operations/test_split_operation.py @@ -2,7 +2,7 @@ import pytest -from mindee.v2.file_operations.split import Split +from mindee.v2.file_operations.split import extract_splits from mindee.input.sources.path_input import PathInput from mindee.v2.product.split.split_response import ( SplitResponse, @@ -37,7 +37,7 @@ def test_single_page_split_split(splits_default, splits_single_page_json_path): with open(splits_single_page_json_path, "rb") as f: response = json.load(f) doc = SplitResponse(response) - extracted_splits = Split.extract_splits(input_sample, doc.inference.result.splits) + extracted_splits = extract_splits(input_sample, doc.inference.result.splits) assert len(extracted_splits) == 1 assert extracted_splits[0].get_page_count() == 1 @@ -48,7 +48,7 @@ def 
test_multi_page_receipt_split(splits_5p, splits_multi_page_json_path): with open(splits_multi_page_json_path, "rb") as f: response = json.load(f) doc = SplitResponse(response) - extracted_splits = Split.extract_splits(input_sample, doc.inference.result.splits) + extracted_splits = extract_splits(input_sample, doc.inference.result.splits) assert len(extracted_splits) == 3 assert extracted_splits[0].get_page_count() == 1 diff --git a/tests/v2/file_operations/test_split_operation_integration.py b/tests/v2/file_operations/test_split_operation_integration.py index e37a83e7..9fba4ad9 100644 --- a/tests/v2/file_operations/test_split_operation_integration.py +++ b/tests/v2/file_operations/test_split_operation_integration.py @@ -10,7 +10,7 @@ SplitResponse, ) from mindee.input.sources.path_input import PathInput -from mindee.v2 import Split +from mindee.v2.file_operations.split import extract_splits from tests.utils import OUTPUT_DIR, V2_PRODUCT_DATA_DIR, cleanup_output_files @@ -38,7 +38,7 @@ def test_pdf_should_extract_splits(): ) assert response.inference.file.page_count == 2 - extracted_pdfs = Split.extract_splits(split_input, response.inference.result.splits) + extracted_pdfs = extract_splits(split_input, response.inference.result.splits) assert len(extracted_pdfs) == 2 assert extracted_pdfs[0].filename == "default_sample_001-001.pdf" @@ -52,8 +52,7 @@ def test_pdf_should_extract_splits(): ), ) check_findoc_return(invoice_0) - for i, extracted_pdf in enumerate(extracted_pdfs): - extracted_pdf.save_to_file(OUTPUT_DIR / f"split_{i + 1:03d}.pdf") + extracted_pdfs.save_all_to_disk(OUTPUT_DIR) for i in range(len(extracted_pdfs)): local_input = PathInput(OUTPUT_DIR / f"split_{i + 1:03d}.pdf") try: