mindee · sebastianMindee · Mar 30, 2026 · Mar 30, 2026 · Mar 30, 2026 · Mar 30, 2026
diff --git a/.github/workflows/cron.yml b/.github/workflows/cron.yml
@@ -9,5 +9,5 @@
     uses: mindee/mindee-api-python/.github/workflows/_test-regressions.yml@main
     secrets: inherit
   test-code-samples:
-    uses: mindee/mindee-api-python/.github/workflows/_smoke_test.yml@main
+    uses: mindee/mindee-api-python/.github/workflows/_smoke-test.yml@main
     secrets: inherit
diff --git a/examples/auto_multi_receipts_extraction_example.py b/examples/auto_multi_receipts_extraction_example.py
@@ -16,7 +16,9 @@ def parse_receipts(input_path):
     extracted_receipts = extract_receipts(input_doc, result_split.document.inference)
 
     for idx, receipt in enumerate(extracted_receipts, 1):
-        result_receipt = mindee_client.parse(product.ReceiptV5, receipt.as_source())
+        result_receipt = mindee_client.parse(
+            product.ReceiptV5, receipt.as_input_source()
+        )
         print(f"Receipt {idx}:")
         print(result_receipt.document)
         print("-" * 40)

diff --git a/mindee/extraction/common/extracted_image.py b/mindee/extraction/common/extracted_image.py
@@ -1,6 +1,6 @@
 import io
 from pathlib import Path
-from typing import Optional
+from typing import Optional, Union
 
 from PIL import Image
 
@@ -17,6 +17,8 @@ class ExtractedImage:
     """Id of the page the image was extracted from."""
     _element_id: int
     """Id of the element on a given page."""
+    filename: str
+    """Name of the file the image was extracted from."""
 
     def __init__(
         self, input_source: LocalInputSource, page_id: int, element_id: int
@@ -30,6 +32,7 @@ def __init__(
         """
         self.buffer = io.BytesIO(input_source.file_object.read())
         self.buffer.name = input_source.filename
+        self.filename = input_source.filename
         if input_source.is_pdf():
             extension = "jpg"
         else:
@@ -43,7 +46,9 @@ def __init__(
         self._page_id = page_id
         self._element_id = 0 if element_id is None else element_id
 
-    def save_to_file(self, output_path: str, file_format: Optional[str] = None):
+    def save_to_file(
+        self, output_path: Union[Path, str], file_format: Optional[str] = None
+    ):
         """
         Saves the document to a file.
 
@@ -56,20 +61,27 @@ def save_to_file(self, output_path: str, file_format: Optional[str] = None):
             if not file_format:
                 if len(resolved_path.suffix) < 1:
                     raise ValueError("Invalid file format.")
-                file_format = (
-                    resolved_path.suffix.upper()
-                )  # technically redundant since PIL applies an upper operation
-                # to the parameter , but older versions may not do so.
+                # Let PIL infer format from filename extension
             self.buffer.seek(0)
             image = Image.open(self.buffer)
-            image.save(resolved_path, format=file_format)
+            if file_format:
+                image.save(resolved_path, format=file_format)
+            else:
+                image.save(resolved_path)
             logger.info("File saved successfully to '%s'.", resolved_path)
         except TypeError as exc:
             raise MindeeError("Invalid path/filename provided.") from exc
         except Exception as exc:
+            logger.error("Could not save file '%s': %s", output_path, exc)
             raise MindeeError(f"Could not save file {Path(output_path).name}.") from exc
 
     def as_source(self) -> FileInput:
+        """
+        Deprecated alias kept for backward compatibility; use ``as_input_source``.
+        """
+        return self.as_input_source()
+
+    def as_input_source(self) -> FileInput:
         """
         Return the file as a Mindee-compatible BufferInput source.
 

diff --git a/mindee/extraction/common/image_extractor.py b/mindee/extraction/common/image_extractor.py
@@ -1,13 +1,13 @@
 import io
-from typing import BinaryIO, List
+from typing import BinaryIO, List, Union
 
 import pypdfium2 as pdfium
 from PIL import Image
 
 from mindee.error.mindee_error import MindeeError
 from mindee.extraction.common.extracted_image import ExtractedImage
 from mindee.geometry.point import Point
-from mindee.geometry.polygon import get_min_max_x, get_min_max_y
+from mindee.geometry.polygon import Polygon, get_min_max_x, get_min_max_y
 from mindee.input.sources.bytes_input import BytesInput
 from mindee.input.sources.local_input_source import LocalInputSource
 
@@ -114,7 +114,9 @@ def get_file_extension(file_format: str):
 
 
 def extract_multiple_images_from_source(
-    input_source: LocalInputSource, page_id: int, polygons: List[List[Point]]
+    input_source: LocalInputSource,
+    page_id: int,
+    polygons: List[Union[Polygon, List[Point]]],
 ) -> List[ExtractedImage]:
     """
     Extracts elements from a page based on a list of bounding boxes.

diff --git a/mindee/extraction/pdf_extractor/extracted_pdf.py b/mindee/extraction/pdf_extractor/extracted_pdf.py
@@ -1,5 +1,5 @@
 from pathlib import Path
-from typing import BinaryIO
+from typing import BinaryIO, Union
 
 import pypdfium2 as pdfium
 
@@ -28,6 +28,10 @@ def get_page_count(self) -> int:
             ) from exc
 
     def write_to_file(self, output_path: str):
+        """Deprecated alias of ``save_to_file``, kept for backward compatibility."""
+        self.save_to_file(output_path)
+
+    def save_to_file(self, output_path: Union[Path, str]):
         """
         Writes the contents of the current PDF object to a file.
 
@@ -40,6 +44,7 @@ def write_to_file(self, output_path: str):
             raise MindeeError("Invalid save path provided {}.")
         if out_path.suffix.lower() != "pdf":
             out_path = out_path.parent / (out_path.stem + "." + "pdf")
+        self.pdf_bytes.seek(0)
         with open(out_path, "wb") as out_file:
             out_file.write(self.pdf_bytes.read())
 

diff --git a/mindee/mindee_http/mindee_api_v2.py b/mindee/mindee_http/mindee_api_v2.py
@@ -4,7 +4,9 @@
 import requests
 
 from mindee.error.mindee_error import MindeeApiV2Error
-from mindee.input import LocalInputSource, UrlInputSource, BaseParameters
+from mindee.input.base_parameters import BaseParameters
+from mindee.input.sources.local_input_source import LocalInputSource
+from mindee.input.sources.url_input_source import UrlInputSource
 from mindee.logger import logger
 from mindee.mindee_http.base_settings import USER_AGENT
 from mindee.mindee_http.settings_mixin import SettingsMixin

diff --git a/mindee/mindee_http/workflow_endpoint.py b/mindee/mindee_http/workflow_endpoint.py
@@ -2,7 +2,9 @@
 
 import requests
 
-from mindee.input import LocalInputSource, UrlInputSource, WorkflowOptions
+from mindee.input.sources.local_input_source import LocalInputSource
+from mindee.input.sources.url_input_source import UrlInputSource
+from mindee.input.workflow_options import WorkflowOptions
 from mindee.mindee_http.base_endpoint import BaseEndpoint
 from mindee.mindee_http.workflow_settings import WorkflowSettings
 

diff --git a/mindee/v2/__init__.py b/mindee/v2/__init__.py
@@ -1,3 +1,8 @@
+from mindee.v2.file_operations.crop import (
+    extract_crops,
+    extract_single_crop,
+)
+from mindee.v2.file_operations.split import extract_splits
 from mindee.v2.product.classification.classification_parameters import (
     ClassificationParameters,
 )
@@ -12,6 +17,9 @@
 from mindee.v2.product.split.split_response import SplitResponse
 
 __all__ = [
+    "extract_crops",
+    "extract_single_crop",
+    "extract_splits",
     "ClassificationResponse",
     "ClassificationParameters",
     "CropResponse",

diff --git a/mindee/v2/file_operations/__init__.py b/mindee/v2/file_operations/__init__.py
@@ -0,0 +1,7 @@
+from mindee.v2.file_operations.crop import (
+    extract_crops,
+    extract_single_crop,
+)
+from mindee.v2.file_operations.split import extract_splits
+
+__all__ = ["extract_crops", "extract_single_crop", "extract_splits"]
diff --git a/mindee/v2/file_operations/crop.py b/mindee/v2/file_operations/crop.py
@@ -0,0 +1,51 @@
+from typing import List, Union
+
+from mindee.error import MindeeError
+from mindee.extraction import ExtractedImage, extract_multiple_images_from_source
+from mindee.geometry import Point, Polygon
+from mindee.input.sources.local_input_source import LocalInputSource
+from mindee.parsing.v2.field import FieldLocation
+from mindee.v2.file_operations.crop_files import CropFiles
+from mindee.v2.product.crop.crop_box import CropBox
+
+
+def extract_single_crop(
+    input_source: LocalInputSource, crop: FieldLocation
+) -> ExtractedImage:
+    """
+    Extract the image region described by a single crop location.
+
+    :param input_source: Local input source to cut the image out of.
+    :param crop: Location (page and polygon) of the region to extract.
+    :return: The first image returned by the extractor for that polygon.
+    """
+    page_id = crop.page
+    regions: List[Union[Polygon, List[Point]]] = [crop.polygon]
+    return extract_multiple_images_from_source(input_source, page_id, regions)[0]
+
+
+def extract_crops(input_source: LocalInputSource, crops: List[CropBox]) -> CropFiles:
+    """
+    Extract every crop of a document as an individual image.
+
+    :param input_source: Local input source to extract the crops from.
+    :param crops: Crop boxes, each holding the page and polygon to cut out.
+    :return: Extracted images wrapped in a ``CropFiles`` list.
+    :raises MindeeError: If ``crops`` is empty.
+    """
+    if not crops:
+        raise MindeeError("No possible candidates found for Crop extraction.")
+    # One bucket of polygons per page, so each page is processed only once.
+    polygons: List[List[Union[Polygon, List[Point]]]] = [
+        [] for _ in range(input_source.page_count)
+    ]
+    for crop in crops:
+        polygons[crop.location.page].append(crop.location.polygon)
+    images: List[ExtractedImage] = []
+    for page_id, page_polygons in enumerate(polygons):
+        if not page_polygons:
+            continue  # Nothing to cut out of this page.
+        images.extend(
+            extract_multiple_images_from_source(input_source, page_id, page_polygons)
+        )
+    return CropFiles(images)
diff --git a/mindee/v2/file_operations/crop_files.py b/mindee/v2/file_operations/crop_files.py
@@ -0,0 +1,20 @@
+from pathlib import Path
+from typing import List, Union
+
+from mindee.extraction import ExtractedImage
+
+
+class CropFiles(List[ExtractedImage]):
+    """List of extracted crop images that can be written to disk in one call."""
+
+    def save_all_to_disk(self, path: Union[Path, str]):
+        """
+        Write every crop to ``path`` as ``crop_001.jpg``, ``crop_002.jpg``, ...
+
+        :param path: Directory to save the crops to; created when missing.
+        """
+        output_dir = Path(path) if isinstance(path, str) else path
+        output_dir.mkdir(parents=True, exist_ok=True)
+        # Numbering starts at 1 and is zero-padded to three digits.
+        for number, crop in enumerate(self, start=1):
+            crop.save_to_file(output_dir / f"crop_{number:03}.jpg")
diff --git a/mindee/v2/file_operations/split.py b/mindee/v2/file_operations/split.py
@@ -0,0 +1,33 @@
+from typing import List, Union
+
+from mindee.error import MindeeError
+from mindee.extraction import PdfExtractor
+from mindee.input.sources.local_input_source import LocalInputSource
+from mindee.v2.file_operations.split_files import SplitFiles
+from mindee.v2.product.split.split_range import SplitRange
+
+
+def extract_splits(
+    input_source: LocalInputSource,
+    splits: Union[List[SplitRange], List[List[int]]],
+) -> SplitFiles:
+    """
+    Extract each split of a document as a standalone PDF.
+
+    :param input_source: Input source to split.
+    :param splits: Inclusive ``[first_page, last_page]`` pairs, given either
+        as ``SplitRange`` objects or as plain lists of two integers.
+    :return: The extracted PDFs wrapped in a ``SplitFiles`` list.
+    :raises MindeeError: If ``splits`` is empty.
+    """
+    # Validate before building the extractor so empty input fails fast.
+    if not splits:
+        raise MindeeError("No indexes provided.")
+    page_groups: List[List[int]] = []
+    for split in splits:
+        bounds = split.page_range if isinstance(split, SplitRange) else split
+        lower_bound, upper_bound = bounds[0], bounds[1]
+        # Bounds are inclusive, hence the ``+ 1`` on the upper one.
+        page_groups.append(list(range(lower_bound, upper_bound + 1)))
+    pdf_extractor = PdfExtractor(input_source)
+    return SplitFiles(pdf_extractor.extract_sub_documents(page_groups))
diff --git a/mindee/v2/file_operations/split_files.py b/mindee/v2/file_operations/split_files.py
@@ -0,0 +1,20 @@
+from pathlib import Path
+from typing import List, Union
+
+from mindee.extraction.pdf_extractor.extracted_pdf import ExtractedPdf
+
+
+class SplitFiles(List[ExtractedPdf]):
+    """List of extracted split PDFs that can be written to disk in one call."""
+
+    def save_all_to_disk(self, path: Union[str, Path]):
+        """
+        Write every split to ``path`` as ``split_001.pdf``, ``split_002.pdf``, ...
+
+        :param path: Directory to save the splits to; created when missing.
+        """
+        output_dir = Path(path) if isinstance(path, str) else path
+        output_dir.mkdir(parents=True, exist_ok=True)
+        # Numbering starts at 1 and is zero-padded to three digits.
+        for number, pdf in enumerate(self, start=1):
+            pdf.save_to_file(output_dir / f"split_{number:03}.pdf")
diff --git a/mindee/v2/product/crop/crop_box.py b/mindee/v2/product/crop/crop_box.py
@@ -1,3 +1,5 @@
+from mindee.extraction import ExtractedImage, extract_multiple_images_from_source
+from mindee.input.sources.local_input_source import LocalInputSource
 from mindee.parsing.common.string_dict import StringDict
 from mindee.parsing.v2.field.field_location import FieldLocation
 
@@ -16,3 +18,14 @@ def __init__(self, server_response: StringDict):
 
     def __str__(self) -> str:
         return f"* :Location: {self.location}\n  :Object Type: {self.object_type}"
+
+    def extract_from_file(self, input_source: LocalInputSource) -> ExtractedImage:
+        """
+        Cut this crop box out of a file and return it as a single image.
+
+        :param input_source: Local file to apply the inference to
+        :return: Extracted image
+        """
+        page_id, polygon = self.location.page, self.location.polygon
+        images = extract_multiple_images_from_source(input_source, page_id, [polygon])
+        return images[0]
diff --git a/mindee/v2/product/crop/crop_response.py b/mindee/v2/product/crop/crop_response.py
@@ -1,4 +1,6 @@
+from mindee.input.sources.local_input_source import LocalInputSource
 from mindee.parsing.common.string_dict import StringDict
+from mindee.v2.file_operations.crop_files import CropFiles
 from mindee.v2.parsing.inference import BaseResponse
 from mindee.v2.product.crop.crop_inference import CropInference
 
@@ -15,3 +17,17 @@ class CropResponse(BaseResponse):
     def __init__(self, raw_response: StringDict) -> None:
         super().__init__(raw_response)
         self.inference = CropInference(raw_response["inference"])
+
+    def extract_from_file(self, input_source: LocalInputSource) -> CropFiles:
+        """
+        Apply the crop inference to a file and return the extracted images.
+
+        :param input_source: Local file to apply the inference to
+        :return: Extracted images, one per crop of the inference result
+        """
+        crops = self.inference.result.crops
+        extracted_crops = CropFiles()
+        # Delegate the actual extraction to each crop.
+        for crop in crops:
+            extracted_crops.append(crop.extract_from_file(input_source))
+        return extracted_crops
diff --git a/mindee/v2/product/split/split_range.py b/mindee/v2/product/split/split_range.py
@@ -1,5 +1,8 @@
 from typing import List
 
+from mindee.extraction.pdf_extractor.extracted_pdf import ExtractedPdf
+from mindee.extraction.pdf_extractor.pdf_extractor import PdfExtractor
+from mindee.input.sources.local_input_source import LocalInputSource
 from mindee.parsing.common.string_dict import StringDict
 
 
@@ -21,3 +24,13 @@ def __init__(self, server_response: StringDict):
     def __str__(self) -> str:
         page_range = ",".join([str(page_index) for page_index in self.page_range])
         return f"* :Page Range: {page_range}\n  :Document Type: {self.document_type}"
+
+    def extract_from_file(self, input_source: LocalInputSource) -> ExtractedPdf:
+        """
+        Extract every page of this split range from a file as a single PDF.
+
+        :param input_source: Local file to apply the inference to
+        :return: Extracted PDF holding the pages from first to last, inclusive
+        """
+        pages = list(range(self.page_range[0], self.page_range[-1] + 1))
+        return PdfExtractor(input_source).extract_sub_documents([pages])[0]