diff --git a/.github/workflows/cron.yml b/.github/workflows/cron.yml index 3fb7ea53..42ce2bb8 100644 --- a/.github/workflows/cron.yml +++ b/.github/workflows/cron.yml @@ -9,5 +9,5 @@ jobs: uses: mindee/mindee-api-python/.github/workflows/_test-regressions.yml@main secrets: inherit test-code-samples: - uses: mindee/mindee-api-python/.github/workflows/_smoke_test.yml@main + uses: mindee/mindee-api-python/.github/workflows/_smoke-test.yml@main secrets: inherit diff --git a/examples/auto_multi_receipts_extraction_example.py b/examples/auto_multi_receipts_extraction_example.py index aa995f51..2906a155 100644 --- a/examples/auto_multi_receipts_extraction_example.py +++ b/examples/auto_multi_receipts_extraction_example.py @@ -16,7 +16,9 @@ def parse_receipts(input_path): extracted_receipts = extract_receipts(input_doc, result_split.document.inference) for idx, receipt in enumerate(extracted_receipts, 1): - result_receipt = mindee_client.parse(product.ReceiptV5, receipt.as_source()) + result_receipt = mindee_client.parse( + product.ReceiptV5, receipt.as_input_source() + ) print(f"Receipt {idx}:") print(result_receipt.document) print("-" * 40) diff --git a/mindee/extraction/common/extracted_image.py b/mindee/extraction/common/extracted_image.py index e4013246..66f7631a 100644 --- a/mindee/extraction/common/extracted_image.py +++ b/mindee/extraction/common/extracted_image.py @@ -1,6 +1,6 @@ import io from pathlib import Path -from typing import Optional +from typing import Optional, Union from PIL import Image @@ -17,6 +17,8 @@ class ExtractedImage: """Id of the page the image was extracted from.""" _element_id: int """Id of the element on a given page.""" + filename: str + """Name of the file the image was extracted from.""" def __init__( self, input_source: LocalInputSource, page_id: int, element_id: int @@ -30,6 +32,7 @@ def __init__( """ self.buffer = io.BytesIO(input_source.file_object.read()) self.buffer.name = input_source.filename + self.filename = input_source.filename if input_source.is_pdf(): extension = "jpg" else: @@ -43,7 +46,9 @@ def __init__( self._page_id = page_id self._element_id = 0 if element_id is None else element_id - def save_to_file(self, output_path: str, file_format: Optional[str] = None): + def save_to_file( + self, output_path: Union[Path, str], file_format: Optional[str] = None + ): """ Saves the document to a file. @@ -56,20 +61,27 @@ def save_to_file(self, output_path: str, file_format: Optional[str] = None): if not file_format: if len(resolved_path.suffix) < 1: raise ValueError("Invalid file format.") - file_format = ( - resolved_path.suffix.upper() - ) # technically redundant since PIL applies an upper operation - # to the parameter , but older versions may not do so. + # Let PIL infer format from filename extension self.buffer.seek(0) image = Image.open(self.buffer) - image.save(resolved_path, format=file_format) + if file_format: + image.save(resolved_path, format=file_format) + else: + image.save(resolved_path) logger.info("File saved successfully to '%s'.", resolved_path) except TypeError as exc: raise MindeeError("Invalid path/filename provided.") from exc except Exception as exc: + print(exc) raise MindeeError(f"Could not save file {Path(output_path).name}.") from exc def as_source(self) -> FileInput: + """ + Deprecated. Use ``as_input_source`` instead. + """ + return self.as_input_source() + + def as_input_source(self) -> FileInput: """ Return the file as a Mindee-compatible BufferInput source. diff --git a/mindee/extraction/common/image_extractor.py b/mindee/extraction/common/image_extractor.py index 5bae6d37..52c57d0d 100644 --- a/mindee/extraction/common/image_extractor.py +++ b/mindee/extraction/common/image_extractor.py @@ -1,5 +1,5 @@ import io -from typing import BinaryIO, List +from typing import BinaryIO, List, Union import pypdfium2 as pdfium from PIL import Image @@ -7,7 +7,7 @@ from mindee.error.mindee_error import MindeeError from mindee.extraction.common.extracted_image import ExtractedImage from mindee.geometry.point import Point -from mindee.geometry.polygon import get_min_max_x, get_min_max_y +from mindee.geometry.polygon import Polygon, get_min_max_x, get_min_max_y from mindee.input.sources.bytes_input import BytesInput from mindee.input.sources.local_input_source import LocalInputSource @@ -114,7 +114,9 @@ def get_file_extension(file_format: str): def extract_multiple_images_from_source( - input_source: LocalInputSource, page_id: int, polygons: List[List[Point]] + input_source: LocalInputSource, + page_id: int, + polygons: List[Union[Polygon, List[Point]]], ) -> List[ExtractedImage]: """ Extracts elements from a page based on a list of bounding boxes. diff --git a/mindee/extraction/pdf_extractor/extracted_pdf.py b/mindee/extraction/pdf_extractor/extracted_pdf.py index 0e3dcb8d..f58e75e9 100644 --- a/mindee/extraction/pdf_extractor/extracted_pdf.py +++ b/mindee/extraction/pdf_extractor/extracted_pdf.py @@ -1,5 +1,5 @@ from pathlib import Path -from typing import BinaryIO +from typing import BinaryIO, Union import pypdfium2 as pdfium @@ -28,6 +28,10 @@ def get_page_count(self) -> int: ) from exc def write_to_file(self, output_path: str): + """Deprecated. Use ``save_to_file`` instead.""" + self.save_to_file(output_path) + + def save_to_file(self, output_path: Union[Path, str]): """ Writes the contents of the current PDF object to a file. @@ -40,6 +44,7 @@ def write_to_file(self, output_path: str): raise MindeeError("Invalid save path provided {}.") if out_path.suffix.lower() != "pdf": out_path = out_path.parent / (out_path.stem + "." + "pdf") + self.pdf_bytes.seek(0) with open(out_path, "wb") as out_file: out_file.write(self.pdf_bytes.read()) diff --git a/mindee/mindee_http/mindee_api_v2.py b/mindee/mindee_http/mindee_api_v2.py index 446582ad..968bb813 100644 --- a/mindee/mindee_http/mindee_api_v2.py +++ b/mindee/mindee_http/mindee_api_v2.py @@ -4,7 +4,9 @@ import requests from mindee.error.mindee_error import MindeeApiV2Error -from mindee.input import LocalInputSource, UrlInputSource, BaseParameters +from mindee.input.base_parameters import BaseParameters +from mindee.input.sources.local_input_source import LocalInputSource +from mindee.input.sources.url_input_source import UrlInputSource from mindee.logger import logger from mindee.mindee_http.base_settings import USER_AGENT from mindee.mindee_http.settings_mixin import SettingsMixin diff --git a/mindee/mindee_http/workflow_endpoint.py b/mindee/mindee_http/workflow_endpoint.py index 7cd4cd2b..ed437d1e 100644 --- a/mindee/mindee_http/workflow_endpoint.py +++ b/mindee/mindee_http/workflow_endpoint.py @@ -2,7 +2,9 @@ import requests -from mindee.input import LocalInputSource, UrlInputSource, WorkflowOptions +from mindee.input.sources.local_input_source import LocalInputSource +from mindee.input.sources.url_input_source import UrlInputSource +from mindee.input.workflow_options import WorkflowOptions from mindee.mindee_http.base_endpoint import BaseEndpoint from mindee.mindee_http.workflow_settings import WorkflowSettings diff --git a/mindee/v2/__init__.py b/mindee/v2/__init__.py index c97f7080..5a8d973e 100644 --- a/mindee/v2/__init__.py +++ b/mindee/v2/__init__.py @@ -1,3 +1,8 @@ +from mindee.v2.file_operations.crop import ( + extract_crops, + extract_single_crop, +) +from mindee.v2.file_operations.split import extract_splits from mindee.v2.product.classification.classification_parameters import ( ClassificationParameters, ) @@ -12,6 +17,10 @@ from mindee.v2.product.split.split_response import SplitResponse __all__ = [ + "extract_crops", + "extract_splits", + "extract_crops", + "extract_single_crop", "ClassificationResponse", "ClassificationParameters", "CropResponse", diff --git a/mindee/v2/file_operations/__init__.py b/mindee/v2/file_operations/__init__.py new file mode 100644 index 00000000..d9065e40 --- /dev/null +++ b/mindee/v2/file_operations/__init__.py @@ -0,0 +1,7 @@ +from mindee.v2.file_operations.crop import ( + extract_crops, + extract_single_crop, +) +from mindee.v2.file_operations.split import extract_splits + +__all__ = ["extract_crops", "extract_splits", "extract_crops", "extract_single_crop"] diff --git a/mindee/v2/file_operations/crop.py b/mindee/v2/file_operations/crop.py new file mode 100644 index 00000000..b2d416df --- /dev/null +++ b/mindee/v2/file_operations/crop.py @@ -0,0 +1,51 @@ +from typing import List, Union + +from mindee.error import MindeeError +from mindee.extraction import ExtractedImage, extract_multiple_images_from_source +from mindee.geometry import Point, Polygon +from mindee.input.sources.local_input_source import LocalInputSource +from mindee.parsing.v2.field import FieldLocation +from mindee.v2.file_operations.crop_files import CropFiles +from mindee.v2.product.crop.crop_box import CropBox + + +def extract_single_crop( + input_source: LocalInputSource, crop: FieldLocation +) -> ExtractedImage: + """ + Extracts a single crop as complete PDFs from the document. + + :param input_source: Local Input Source to extract sub-receipts from. + :param crop: Crop to extract. + :return: ExtractedImage. + """ + + polygons: List[Union[Polygon, List[Point]]] = [crop.polygon] + return extract_multiple_images_from_source(input_source, crop.page, polygons)[0] + + +def extract_crops(input_source: LocalInputSource, crops: List[CropBox]) -> CropFiles: + """ + Extracts individual receipts from multi-receipts documents. + + :param input_source: Local Input Source to extract sub-receipts from. + :param crops: List of crops. + :return: Individual extracted receipts as an array of ExtractedImage. + """ + images: List[ExtractedImage] = [] + if not crops: + raise MindeeError("No possible candidates found for Crop extraction.") + polygons: List[List[Union[Polygon, List[Point]]]] = [ + [] for _ in range(input_source.page_count) + ] + for i, crop in enumerate(crops): + polygons[crop.location.page].append(crop.location.polygon) + for i, polygon in enumerate(polygons): + images.extend( + extract_multiple_images_from_source( + input_source, + i, + polygon, + ) + ) + return CropFiles(images) diff --git a/mindee/v2/file_operations/crop_files.py b/mindee/v2/file_operations/crop_files.py new file mode 100644 index 00000000..9887b669 --- /dev/null +++ b/mindee/v2/file_operations/crop_files.py @@ -0,0 +1,20 @@ +from pathlib import Path +from typing import List, Union + +from mindee.extraction import ExtractedImage + + +class CropFiles(List[ExtractedImage]): + """Crop files.""" + + def save_all_to_disk(self, path: Union[Path, str]): + """ + Save all extracted crops to disk. + + :param path: Path to save the extracted splits to + """ + if isinstance(path, str): + path = Path(path) + path.mkdir(parents=True, exist_ok=True) + for idx, split in enumerate(self, start=1): + split.save_to_file(path / f"crop_{idx:03}.jpg") diff --git a/mindee/v2/file_operations/split.py b/mindee/v2/file_operations/split.py new file mode 100644 index 00000000..e528e7b4 --- /dev/null +++ b/mindee/v2/file_operations/split.py @@ -0,0 +1,33 @@ +from typing import List, Union + +from mindee.error import MindeeError +from mindee.extraction import PdfExtractor +from mindee.input.sources.local_input_source import LocalInputSource +from mindee.v2.file_operations.split_files import SplitFiles +from mindee.v2.product.split.split_range import SplitRange + + +def extract_splits( + input_source: LocalInputSource, + splits: Union[List[SplitRange], List[List[int]]], +) -> SplitFiles: + """ + Extracts splits as complete PDFs from the document. + + :param input_source: Input source to split. + :param splits: List of sub-lists of pages to keep. + :return: A list of extracted invoices. + """ + pdf_extractor = PdfExtractor(input_source) + page_groups = [] + for split in splits: + if isinstance(split, SplitRange): + lower_bound = split.page_range[0] + upper_bound = split.page_range[1] + else: + lower_bound = split[0] + upper_bound = split[1] + page_groups.append(list(range(lower_bound, upper_bound + 1))) + if len(splits) < 1: + raise MindeeError("No indexes provided.") + return SplitFiles(pdf_extractor.extract_sub_documents(page_groups)) diff --git a/mindee/v2/file_operations/split_files.py b/mindee/v2/file_operations/split_files.py new file mode 100644 index 00000000..8368ecb2 --- /dev/null +++ b/mindee/v2/file_operations/split_files.py @@ -0,0 +1,20 @@ +from pathlib import Path +from typing import List, Union + +from mindee.extraction.pdf_extractor.extracted_pdf import ExtractedPdf + + +class SplitFiles(List[ExtractedPdf]): + """Split files.""" + + def save_all_to_disk(self, path: Union[str, Path]): + """ + Save all extracted splits to disk. + + :param path: Path to save the extracted splits to + """ + if isinstance(path, str): + path = Path(path) + path.mkdir(parents=True, exist_ok=True) + for idx, split in enumerate(self, start=1): + split.save_to_file(path / f"split_{idx:03}.pdf") diff --git a/mindee/v2/product/crop/crop_box.py b/mindee/v2/product/crop/crop_box.py index 62a32840..88cea161 100644 --- a/mindee/v2/product/crop/crop_box.py +++ b/mindee/v2/product/crop/crop_box.py @@ -1,3 +1,5 @@ +from mindee.extraction import ExtractedImage, extract_multiple_images_from_source +from mindee.input.sources.local_input_source import LocalInputSource from mindee.parsing.common.string_dict import StringDict from mindee.parsing.v2.field.field_location import FieldLocation @@ -16,3 +18,14 @@ def __init__(self, server_response: StringDict): def __str__(self) -> str: return f"* :Location: {self.location}\n :Object Type: {self.object_type}" + + def extract_from_file(self, input_source: LocalInputSource) -> ExtractedImage: + """ + Apply the split range inference to a file and return a single extracted PDF. + + :param input_source: Local file to apply the inference to + :return: Extracted PDF + """ + return extract_multiple_images_from_source( + input_source, self.location.page, [self.location.polygon] + )[0] diff --git a/mindee/v2/product/crop/crop_response.py b/mindee/v2/product/crop/crop_response.py index 03b4be0e..3d2c35b0 100644 --- a/mindee/v2/product/crop/crop_response.py +++ b/mindee/v2/product/crop/crop_response.py @@ -1,4 +1,6 @@ +from mindee.input.sources.local_input_source import LocalInputSource from mindee.parsing.common.string_dict import StringDict +from mindee.v2.file_operations.crop_files import CropFiles from mindee.v2.parsing.inference import BaseResponse from mindee.v2.product.crop.crop_inference import CropInference @@ -15,3 +17,17 @@ class CropResponse(BaseResponse): def __init__(self, raw_response: StringDict) -> None: super().__init__(raw_response) self.inference = CropInference(raw_response["inference"]) + + def extract_from_file(self, input_source: LocalInputSource) -> CropFiles: + """ + Apply the crop inference to a file and return a list of extracted images. + + :param input_source: Local file to apply the inference to + :return: List of extracted PDFs + """ + return CropFiles( + [ + crop.extract_from_file(input_source) + for crop in self.inference.result.crops + ] + ) diff --git a/mindee/v2/product/split/split_range.py b/mindee/v2/product/split/split_range.py index e0e70110..9888a930 100644 --- a/mindee/v2/product/split/split_range.py +++ b/mindee/v2/product/split/split_range.py @@ -1,5 +1,8 @@ from typing import List +from mindee.extraction.pdf_extractor.extracted_pdf import ExtractedPdf +from mindee.extraction.pdf_extractor.pdf_extractor import PdfExtractor +from mindee.input.sources.local_input_source import LocalInputSource from mindee.parsing.common.string_dict import StringDict @@ -21,3 +24,13 @@ def __init__(self, server_response: StringDict): def __str__(self) -> str: page_range = ",".join([str(page_index) for page_index in self.page_range]) return f"* :Page Range: {page_range}\n :Document Type: {self.document_type}" + + def extract_from_file(self, input_source: LocalInputSource) -> ExtractedPdf: + """ + Apply the split range inference to a file and return a single extracted PDF. + + :param input_source: Local file to apply the inference to + :return: Extracted PDF + """ + pdf_extractor = PdfExtractor(input_source) + return pdf_extractor.extract_sub_documents([self.page_range])[0] diff --git a/mindee/v2/product/split/split_response.py b/mindee/v2/product/split/split_response.py index ec5112ef..b4770902 100644 --- a/mindee/v2/product/split/split_response.py +++ b/mindee/v2/product/split/split_response.py @@ -1,4 +1,6 @@ +from mindee.input.sources.local_input_source import LocalInputSource from mindee.parsing.common.string_dict import StringDict +from mindee.v2.file_operations.split_files import SplitFiles from mindee.v2.parsing.inference import BaseResponse from mindee.v2.product.split.split_inference import SplitInference @@ -15,3 +17,17 @@ class SplitResponse(BaseResponse): def __init__(self, raw_response: StringDict) -> None: super().__init__(raw_response) self.inference = SplitInference(raw_response["inference"]) + + def extract_from_file(self, input_source: LocalInputSource) -> SplitFiles: + """ + Apply the split inference to a file and return a list of extracted PDFs. + + :param input_source: Local file to apply the inference to + :return: List of extracted PDFs + """ + return SplitFiles( + [ + split.extract_from_file(input_source) + for split in self.inference.result.splits + ] + ) diff --git a/tests/data b/tests/data index c2e36f5b..53f0efbc 160000 --- a/tests/data +++ b/tests/data @@ -1 +1 @@ -Subproject commit c2e36f5b635386cb9bb922b517c4e02039b0a122 +Subproject commit 53f0efbc08c77c2c085aadd27de9d2d6c359276e diff --git a/tests/input/test_compression.py b/tests/input/test_compression.py index b3b7c767..fde5d064 100644 --- a/tests/input/test_compression.py +++ b/tests/input/test_compression.py @@ -9,9 +9,14 @@ from mindee.input.sources import PathInput from mindee.pdf.pdf_compressor import compress_pdf from mindee.pdf.pdf_utils import extract_text_from_pdf -from tests.utils import FILE_TYPES_DIR, ROOT_DATA_DIR, V1_DATA_DIR, V1_PRODUCT_DATA_DIR +from tests.utils import ( + FILE_TYPES_DIR, + OUTPUT_DIR, + V1_DATA_DIR, + V1_PRODUCT_DATA_DIR, + cleanup_output_files, +) -OUTPUT_DIR = ROOT_DATA_DIR / "output" RECEIPT_PATH = FILE_TYPES_DIR / "receipt.jpg" @@ -202,26 +207,23 @@ def test_pdf_compress_with_text_does_not_compress(): @pytest.fixture(scope="module", autouse=True) def cleanup(): yield - created_files = [ - "compress10.pdf", - "compress50.pdf", - "compress75.pdf", - "compress85.pdf", - "resize_indirect.pdf", - "compress1.jpg", - "compress10.jpg", - "compress50.jpg", - "compress75.jpg", - "compress100.jpg", - "compress_indirect.jpg", - "resize250x500.jpg", - "resize500x250.jpg", - "resize500xnull.jpg", - "resize_indirect.jpg", - "resizenullx250.jpg", - ] - - for file_path in created_files: - full_path = OUTPUT_DIR / file_path - if full_path.exists(): - os.remove(full_path) + cleanup_output_files( + [ + "compress10.pdf", + "compress50.pdf", + "compress75.pdf", + "compress85.pdf", + "resize_indirect.pdf", + "compress1.jpg", + "compress10.jpg", + "compress50.jpg", + "compress75.jpg", + "compress100.jpg", + "compress_indirect.jpg", + "resize250x500.jpg", + "resize500x250.jpg", + "resize500xnull.jpg", + "resize_indirect.jpg", + "resizenullx250.jpg", + ] + ) diff --git a/tests/utils.py b/tests/utils.py index 252a699c..79948522 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,3 +1,4 @@ +import os from difflib import SequenceMatcher from pathlib import Path @@ -9,6 +10,7 @@ ROOT_DATA_DIR = Path(__file__).parent / "data" FILE_TYPES_DIR = ROOT_DATA_DIR / "file_types" +OUTPUT_DIR = ROOT_DATA_DIR / "output" V1_DATA_DIR = ROOT_DATA_DIR / "v1" V1_ERROR_DATA_DIR = V1_DATA_DIR / "errors" @@ -44,3 +46,10 @@ def levenshtein_ratio(ref_str: str, target_str: str) -> float: :return: Ratio between the two strings """ return SequenceMatcher(None, ref_str, target_str).ratio() + + +def cleanup_output_files(created_files): + for file_path in created_files: + full_path = OUTPUT_DIR / file_path + if full_path.exists(): + os.remove(full_path) diff --git a/tests/v1/extraction/test_image_extractor.py b/tests/v1/extraction/test_image_extractor.py index 6416802c..87147ca3 100644 --- a/tests/v1/extraction/test_image_extractor.py +++ b/tests/v1/extraction/test_image_extractor.py @@ -35,9 +35,9 @@ def test_barcode_image_extraction(barcode_path, barcode_json_path): assert len(extracted_barcodes_1d) == 1 assert len(extracted_barcodes_2d) == 2 - assert extracted_barcodes_1d[0].as_source().filename.endswith("jpg") + assert extracted_barcodes_1d[0].as_input_source().filename.endswith("jpg") assert Image.open(extracted_barcodes_1d[0].buffer).size == (353, 200) assert Image.open(extracted_barcodes_2d[0].buffer).size == (214, 216) - assert extracted_barcodes_2d[0].as_source().filename.endswith("jpg") - assert extracted_barcodes_2d[1].as_source().filename.endswith("jpg") + assert extracted_barcodes_2d[0].as_input_source().filename.endswith("jpg") + assert extracted_barcodes_2d[1].as_input_source().filename.endswith("jpg") assert Image.open(extracted_barcodes_2d[1].buffer).size == (193, 201) diff --git a/tests/v1/input/test_url_input_source_integration.py b/tests/v1/input/test_url_input_source_integration.py index 6d6616bd..82c81b11 100644 --- a/tests/v1/input/test_url_input_source_integration.py +++ b/tests/v1/input/test_url_input_source_integration.py @@ -5,6 +5,7 @@ from mindee import Client from mindee.product.invoice import InvoiceV4 +from tests.utils import cleanup_output_files @pytest.fixture @@ -55,14 +56,8 @@ def test_save_file_with_filename(client, reference_file_path, output_file_path): @pytest.fixture(autouse=True) -def cleanup(request, output_file_path: Path): +def cleanup(request): def remove_test_files(): - generated_files = [ - Path.resolve(output_file_path / "invoice_5p.pdf"), - Path.resolve(output_file_path / "customFileName.pdf"), - ] - for filepath in generated_files: - if os.path.exists(filepath): - os.remove(filepath) + cleanup_output_files(["invoice_5p.pdf", "customFileName.pdf"]) request.addfinalizer(remove_test_files) diff --git a/tests/v2/file_operations/__init__.py b/tests/v2/file_operations/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/v2/file_operations/test_crop_operation.py b/tests/v2/file_operations/test_crop_operation.py new file mode 100644 index 00000000..24fb928c --- /dev/null +++ b/tests/v2/file_operations/test_crop_operation.py @@ -0,0 +1,64 @@ +import json + +import pytest +from PIL import Image + +from mindee.v2.file_operations.crop import extract_crops +from mindee.input.sources.path_input import PathInput +from mindee.v2.product.crop.crop_response import ( + CropResponse, +) +from tests.utils import V2_PRODUCT_DATA_DIR + + +@pytest.fixture +def crops_single_page_path(): + return V2_PRODUCT_DATA_DIR / "crop" / "default_sample.jpg" + + +@pytest.fixture +def crops_multi_page_path(): + return V2_PRODUCT_DATA_DIR / "crop" / "multipage_sample.pdf" + + +@pytest.fixture +def crops_single_page_json_path(): + return V2_PRODUCT_DATA_DIR / "crop" / "crop_single.json" + + +@pytest.fixture +def crops_multi_page_json_path(): + return V2_PRODUCT_DATA_DIR / "crop" / "crop_multiple.json" + + +def test_single_page_crop_split(crops_single_page_path, crops_single_page_json_path): + input_sample = PathInput(crops_single_page_path) + with open(crops_single_page_json_path, "rb") as f: + response = json.load(f) + doc = CropResponse(response) + extracted_crops = extract_crops(input_sample, doc.inference.result.crops) + assert len(extracted_crops) == 1 + + assert extracted_crops[0].page_id == 0 + assert extracted_crops[0].element_id == 0 + image_buffer_0 = Image.open(extracted_crops[0].buffer) + assert image_buffer_0.size == (2823, 1571) + + +def test_multi_page_receipt_split(crops_multi_page_path, crops_multi_page_json_path): + input_sample = PathInput(crops_multi_page_path) + with open(crops_multi_page_json_path, "rb") as f: + response = json.load(f) + doc = CropResponse(response) + extracted_crops = extract_crops(input_sample, doc.inference.result.crops) + assert len(extracted_crops) == 2 + + assert extracted_crops[0].page_id == 0 + assert extracted_crops[0].element_id == 0 + image_buffer_0 = Image.open(extracted_crops[0].buffer) + assert image_buffer_0.size == (156, 758) + + assert extracted_crops[1].page_id == 0 + assert extracted_crops[1].element_id == 1 + image_buffer_1 = Image.open(extracted_crops[1].buffer) + assert image_buffer_1.size == (187, 690) diff --git a/tests/v2/file_operations/test_crop_operation_integration.py b/tests/v2/file_operations/test_crop_operation_integration.py new file mode 100644 index 00000000..a7541a32 --- /dev/null +++ b/tests/v2/file_operations/test_crop_operation_integration.py @@ -0,0 +1,61 @@ +import os +from os import getenv + +import pytest + +from mindee import ( + ClientV2, + InferenceParameters, + InferenceResponse, + CropParameters, + CropResponse, +) +from mindee.input.sources.path_input import PathInput +from mindee.v2.file_operations.crop import extract_crops +from tests.utils import OUTPUT_DIR, V2_PRODUCT_DATA_DIR, cleanup_output_files + + +@pytest.fixture +def crop_sample(): + return V2_PRODUCT_DATA_DIR / "crop" / "default_sample.jpg" + + +def check_findoc_return(findoc_response: InferenceResponse): + assert len(findoc_response.inference.model.id) > 0 + assert findoc_response.inference.result.fields.get("total_amount").value > 0 + + +@pytest.mark.integration +def test_image_should_extract_crops(): + client = ClientV2() + crop_input = PathInput(V2_PRODUCT_DATA_DIR / "crop" / "default_sample.jpg") + response = client.enqueue_and_get_result( + CropResponse, + crop_input, + CropParameters(getenv("MINDEE_V2_SE_TESTS_CROP_MODEL_ID"), close_file=False), + ) + assert len(response.inference.result.crops) == 2 + + extracted_images = extract_crops(crop_input, response.inference.result.crops) + + assert len(extracted_images) == 2 + assert extracted_images[0].filename == "default_sample.jpg_page1-0.jpg" + assert extracted_images[1].filename == "default_sample.jpg_page1-1.jpg" + + invoice_0 = client.enqueue_and_get_result( + InferenceResponse, + extracted_images[0].as_input_source(), + InferenceParameters( + getenv("MINDEE_V2_SE_TESTS_FINDOC_MODEL_ID"), close_file=False + ), + ) + check_findoc_return(invoice_0) + extracted_images.save_all_to_disk(OUTPUT_DIR) + assert os.path.getsize(OUTPUT_DIR / "crop_001.jpg") == 198887 + assert os.path.getsize(OUTPUT_DIR / "crop_002.jpg") == 197443 + + +@pytest.fixture(scope="module", autouse=True) +def cleanup(): + yield + cleanup_output_files(["crop_001.jpg", "crop_002.jpg"]) diff --git a/tests/v2/file_operations/test_split_operation.py b/tests/v2/file_operations/test_split_operation.py new file mode 100644 index 00000000..8e70b5ac --- /dev/null +++ b/tests/v2/file_operations/test_split_operation.py @@ -0,0 +1,56 @@ +import json + +import pytest + +from mindee.v2.file_operations.split import extract_splits +from mindee.input.sources.path_input import PathInput +from mindee.v2.product.split.split_response import ( + SplitResponse, +) +from tests.utils import V2_PRODUCT_DATA_DIR + + +@pytest.fixture +def splits_default(): + return ( + V2_PRODUCT_DATA_DIR / "extraction" / "financial_document" / "default_sample.jpg" + ) + + +@pytest.fixture +def splits_5p(): + return V2_PRODUCT_DATA_DIR / "split" / "invoice_5p.pdf" + + +@pytest.fixture +def splits_single_page_json_path(): + return V2_PRODUCT_DATA_DIR / "split" / "split_single.json" + + +@pytest.fixture +def splits_multi_page_json_path(): + return V2_PRODUCT_DATA_DIR / "split" / "split_multiple.json" + + +def test_single_page_split_split(splits_default, splits_single_page_json_path): + input_sample = PathInput(splits_default) + with open(splits_single_page_json_path, "rb") as f: + response = json.load(f) + doc = SplitResponse(response) + extracted_splits = extract_splits(input_sample, doc.inference.result.splits) + assert len(extracted_splits) == 1 + + assert extracted_splits[0].get_page_count() == 1 + + +def test_multi_page_receipt_split(splits_5p, splits_multi_page_json_path): + input_sample = PathInput(splits_5p) + with open(splits_multi_page_json_path, "rb") as f: + response = json.load(f) + doc = SplitResponse(response) + extracted_splits = extract_splits(input_sample, doc.inference.result.splits) + assert len(extracted_splits) == 3 + + assert extracted_splits[0].get_page_count() == 1 + assert extracted_splits[1].get_page_count() == 3 + assert extracted_splits[2].get_page_count() == 1 diff --git a/tests/v2/file_operations/test_split_operation_integration.py b/tests/v2/file_operations/test_split_operation_integration.py new file mode 100644 index 00000000..9fba4ad9 --- /dev/null +++ b/tests/v2/file_operations/test_split_operation_integration.py @@ -0,0 +1,68 @@ +from os import getenv + +import pytest + +from mindee import ( + ClientV2, + InferenceParameters, + InferenceResponse, + SplitParameters, + SplitResponse, +) +from mindee.input.sources.path_input import PathInput +from mindee.v2.file_operations.split import extract_splits +from tests.utils import OUTPUT_DIR, V2_PRODUCT_DATA_DIR, cleanup_output_files + + +@pytest.fixture +def invoice_splitter_5p_path(): + return V2_PRODUCT_DATA_DIR / "split" / "invoice_5p.pdf" + + +def check_findoc_return(findoc_response: InferenceResponse): + assert len(findoc_response.inference.model.id) > 0 + assert findoc_response.inference.result.fields.get("total_amount").value > 0 + + +@pytest.mark.integration +def test_pdf_should_extract_splits(): + client = ClientV2() + split_input = PathInput(V2_PRODUCT_DATA_DIR / "split" / "default_sample.pdf") + response = client.enqueue_and_get_result( + SplitResponse, + split_input, + SplitParameters( + getenv("MINDEE_V2_SE_TESTS_SPLIT_MODEL_ID"), + close_file=False, + ), + ) + assert response.inference.file.page_count == 2 + + extracted_pdfs = extract_splits(split_input, response.inference.result.splits) + + assert len(extracted_pdfs) == 2 + assert extracted_pdfs[0].filename == "default_sample_001-001.pdf" + assert extracted_pdfs[1].filename == "default_sample_002-002.pdf" + + invoice_0 = client.enqueue_and_get_result( + InferenceResponse, + extracted_pdfs[0].as_input_source(), + InferenceParameters( + getenv("MINDEE_V2_SE_TESTS_FINDOC_MODEL_ID"), close_file=False + ), + ) + check_findoc_return(invoice_0) + extracted_pdfs.save_all_to_disk(OUTPUT_DIR) + for i in range(len(extracted_pdfs)): + local_input = PathInput(OUTPUT_DIR / f"split_{i + 1:03d}.pdf") + try: + assert local_input.page_count == extracted_pdfs[i].get_page_count() + finally: + local_input.close() + split_input.close() + + +@pytest.fixture(scope="module", autouse=True) +def cleanup(): + yield + cleanup_output_files(["split_001.pdf", "split_002.pdf"]) diff --git a/tests/v2/product/crop/test_crop_response.py b/tests/v2/product/crop/test_crop_response.py index 635a89ea..98832745 100644 --- a/tests/v2/product/crop/test_crop_response.py +++ b/tests/v2/product/crop/test_crop_response.py @@ -64,6 +64,6 @@ def test_crop_multiple(): assert response.inference.result.crops[1].location.polygon[3][0] == 0.547 assert response.inference.result.crops[1].location.polygon[3][1] == 0.97 assert response.inference.result.crops[1].location.page == 0 - assert response.inference.result.crops[1].object_type == "invoice" + assert response.inference.result.crops[1].object_type == "receipt" assert rst_sample == str(response) diff --git a/tests/v2/product/split/test_split_response.py b/tests/v2/product/split/test_split_response.py index 4ce2ad8b..29381469 100644 --- a/tests/v2/product/split/test_split_response.py +++ b/tests/v2/product/split/test_split_response.py @@ -32,7 +32,7 @@ def test_split_multiple(): assert len(response.inference.result.splits[0].page_range) == 2 assert response.inference.result.splits[0].page_range[0] == 0 assert response.inference.result.splits[0].page_range[1] == 0 - assert response.inference.result.splits[0].document_type == "invoice" + assert response.inference.result.splits[0].document_type == "passport" assert len(response.inference.result.splits[1].page_range) == 2 assert response.inference.result.splits[1].page_range[0] == 1 @@ -42,4 +42,4 @@ def test_split_multiple(): assert len(response.inference.result.splits[2].page_range) == 2 assert response.inference.result.splits[2].page_range[0] == 4 assert response.inference.result.splits[2].page_range[1] == 4 - assert response.inference.result.splits[2].document_type == "invoice" + assert response.inference.result.splits[2].document_type == "receipt" diff --git a/tests/v2/test_client.py b/tests/v2/test_client.py index a4e87685..5eb0d3fa 100644 --- a/tests/v2/test_client.py +++ b/tests/v2/test_client.py @@ -6,7 +6,8 @@ from mindee import ClientV2, InferenceParameters, InferenceResponse, LocalResponse from mindee.error.mindee_error import MindeeApiV2Error, MindeeError from mindee.error.mindee_http_error_v2 import MindeeHTTPErrorV2 -from mindee.input import LocalInputSource, PathInput +from mindee.input.sources.local_input_source import LocalInputSource +from mindee.input.sources.path_input import PathInput from mindee.mindee_http.base_settings import USER_AGENT from mindee.parsing.v2.inference import Inference from mindee.parsing.v2.job import Job