Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/cron.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,5 @@
uses: mindee/mindee-api-python/.github/workflows/_test-regressions.yml@main
secrets: inherit
test-code-samples:
uses: mindee/mindee-api-python/.github/workflows/_smoke_test.yml@main
uses: mindee/mindee-api-python/.github/workflows/_smoke-test.yml@main
secrets: inherit

Check warning

Code scanning / CodeQL

Workflow does not contain permissions Medium

Actions job or workflow does not limit the permissions of the GITHUB_TOKEN. Consider setting an explicit permissions block, using the following as a minimal starting point: {}
4 changes: 3 additions & 1 deletion examples/auto_multi_receipts_extraction_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@ def parse_receipts(input_path):
extracted_receipts = extract_receipts(input_doc, result_split.document.inference)

for idx, receipt in enumerate(extracted_receipts, 1):
result_receipt = mindee_client.parse(product.ReceiptV5, receipt.as_source())
result_receipt = mindee_client.parse(
product.ReceiptV5, receipt.as_input_source()
)
print(f"Receipt {idx}:")
print(result_receipt.document)
print("-" * 40)
Expand Down
26 changes: 19 additions & 7 deletions mindee/extraction/common/extracted_image.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import io
from pathlib import Path
from typing import Optional
from typing import Optional, Union

from PIL import Image

Expand All @@ -17,6 +17,8 @@ class ExtractedImage:
"""Id of the page the image was extracted from."""
_element_id: int
"""Id of the element on a given page."""
filename: str
"""Name of the file the image was extracted from."""

def __init__(
self, input_source: LocalInputSource, page_id: int, element_id: int
Expand All @@ -30,6 +32,7 @@ def __init__(
"""
self.buffer = io.BytesIO(input_source.file_object.read())
self.buffer.name = input_source.filename
self.filename = input_source.filename
if input_source.is_pdf():
extension = "jpg"
else:
Expand All @@ -43,7 +46,9 @@ def __init__(
self._page_id = page_id
self._element_id = 0 if element_id is None else element_id

def save_to_file(self, output_path: str, file_format: Optional[str] = None):
def save_to_file(
self, output_path: Union[Path, str], file_format: Optional[str] = None
):
"""
Saves the document to a file.

Expand All @@ -56,20 +61,27 @@ def save_to_file(self, output_path: str, file_format: Optional[str] = None):
if not file_format:
if len(resolved_path.suffix) < 1:
raise ValueError("Invalid file format.")
file_format = (
resolved_path.suffix.upper()
) # technically redundant since PIL applies an upper operation
# to the parameter , but older versions may not do so.
# Let PIL infer format from filename extension
self.buffer.seek(0)
image = Image.open(self.buffer)
image.save(resolved_path, format=file_format)
if file_format:
image.save(resolved_path, format=file_format)
else:
image.save(resolved_path)
logger.info("File saved successfully to '%s'.", resolved_path)
except TypeError as exc:
raise MindeeError("Invalid path/filename provided.") from exc
except Exception as exc:
print(exc)
raise MindeeError(f"Could not save file {Path(output_path).name}.") from exc

def as_source(self) -> FileInput:
    """
    Deprecated. Use ``as_input_source`` instead.

    Kept as a thin alias: it forwards to ``as_input_source`` and returns its
    result unchanged.
    """
    return self.as_input_source()

def as_input_source(self) -> FileInput:
"""
Return the file as a Mindee-compatible BufferInput source.

Expand Down
8 changes: 5 additions & 3 deletions mindee/extraction/common/image_extractor.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import io
from typing import BinaryIO, List
from typing import BinaryIO, List, Union

import pypdfium2 as pdfium
from PIL import Image

from mindee.error.mindee_error import MindeeError
from mindee.extraction.common.extracted_image import ExtractedImage
from mindee.geometry.point import Point
from mindee.geometry.polygon import get_min_max_x, get_min_max_y
from mindee.geometry.polygon import Polygon, get_min_max_x, get_min_max_y
from mindee.input.sources.bytes_input import BytesInput
from mindee.input.sources.local_input_source import LocalInputSource

Expand Down Expand Up @@ -114,7 +114,9 @@ def get_file_extension(file_format: str):


def extract_multiple_images_from_source(
input_source: LocalInputSource, page_id: int, polygons: List[List[Point]]
input_source: LocalInputSource,
page_id: int,
polygons: List[Union[Polygon, List[Point]]],
) -> List[ExtractedImage]:
"""
Extracts elements from a page based on a list of bounding boxes.
Expand Down
7 changes: 6 additions & 1 deletion mindee/extraction/pdf_extractor/extracted_pdf.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from pathlib import Path
from typing import BinaryIO
from typing import BinaryIO, Union

import pypdfium2 as pdfium

Expand Down Expand Up @@ -28,6 +28,10 @@ def get_page_count(self) -> int:
) from exc

def write_to_file(self, output_path: str):
    """
    Deprecated. Use ``save_to_file`` instead.

    Kept as an alias that forwards ``output_path`` to ``save_to_file``.

    :param output_path: Path handed over to ``save_to_file``.
    """
    self.save_to_file(output_path)

def save_to_file(self, output_path: Union[Path, str]):
"""
Writes the contents of the current PDF object to a file.

Expand All @@ -40,6 +44,7 @@ def write_to_file(self, output_path: str):
raise MindeeError("Invalid save path provided {}.")
if out_path.suffix.lower() != "pdf":
out_path = out_path.parent / (out_path.stem + "." + "pdf")
self.pdf_bytes.seek(0)
with open(out_path, "wb") as out_file:
out_file.write(self.pdf_bytes.read())

Expand Down
4 changes: 3 additions & 1 deletion mindee/mindee_http/mindee_api_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@
import requests

from mindee.error.mindee_error import MindeeApiV2Error
from mindee.input import LocalInputSource, UrlInputSource, BaseParameters
from mindee.input.base_parameters import BaseParameters
from mindee.input.sources.local_input_source import LocalInputSource
from mindee.input.sources.url_input_source import UrlInputSource
from mindee.logger import logger
from mindee.mindee_http.base_settings import USER_AGENT
from mindee.mindee_http.settings_mixin import SettingsMixin
Expand Down
4 changes: 3 additions & 1 deletion mindee/mindee_http/workflow_endpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@

import requests

from mindee.input import LocalInputSource, UrlInputSource, WorkflowOptions
from mindee.input.sources.local_input_source import LocalInputSource
from mindee.input.sources.url_input_source import UrlInputSource
from mindee.input.workflow_options import WorkflowOptions
from mindee.mindee_http.base_endpoint import BaseEndpoint
from mindee.mindee_http.workflow_settings import WorkflowSettings

Expand Down
9 changes: 9 additions & 0 deletions mindee/v2/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
from mindee.v2.file_operations.crop import (
extract_crops,
extract_single_crop,
)
from mindee.v2.file_operations.split import extract_splits
from mindee.v2.product.classification.classification_parameters import (
ClassificationParameters,
)
Expand All @@ -12,6 +17,10 @@
from mindee.v2.product.split.split_response import SplitResponse

__all__ = [
"extract_crops",
"extract_splits",
"extract_crops",
"extract_single_crop",
"ClassificationResponse",
"ClassificationParameters",
"CropResponse",
Expand Down
7 changes: 7 additions & 0 deletions mindee/v2/file_operations/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from mindee.v2.file_operations.crop import (
extract_crops,
extract_single_crop,
)
from mindee.v2.file_operations.split import extract_splits

# Public names re-exported by this package; each name is listed exactly once.
__all__ = ["extract_crops", "extract_splits", "extract_single_crop"]
51 changes: 51 additions & 0 deletions mindee/v2/file_operations/crop.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
from typing import List, Union

from mindee.error import MindeeError
from mindee.extraction import ExtractedImage, extract_multiple_images_from_source
from mindee.geometry import Point, Polygon
from mindee.input.sources.local_input_source import LocalInputSource
from mindee.parsing.v2.field import FieldLocation
from mindee.v2.file_operations.crop_files import CropFiles
from mindee.v2.product.crop.crop_box import CropBox


def extract_single_crop(
    input_source: LocalInputSource, crop: FieldLocation
) -> ExtractedImage:
    """
    Cut one located zone out of a document and return it as an image.

    :param input_source: Local Input Source containing the page to crop.
    :param crop: Location whose ``page`` and ``polygon`` delimit the zone.
    :return: First image returned by the extractor for that polygon.
    """
    single_polygon: List[Union[Polygon, List[Point]]] = [crop.polygon]
    extracted = extract_multiple_images_from_source(
        input_source, crop.page, single_polygon
    )
    return extracted[0]


def extract_crops(input_source: LocalInputSource, crops: List[CropBox]) -> CropFiles:
    """
    Extract every crop zone of a document as an individual image.

    Polygons are grouped by page so that each page is handed to the image
    extractor once. The returned images are therefore ordered by page index,
    then by their order of appearance in ``crops``.

    :param input_source: Local Input Source to extract the crops from.
    :param crops: List of crops.
    :return: Extracted crops, as a ``CropFiles`` list of ExtractedImage.
    :raises MindeeError: If ``crops`` is empty.
    """
    if not crops:
        raise MindeeError("No possible candidates found for Crop extraction.")
    polygons_by_page: List[List[Union[Polygon, List[Point]]]] = [
        [] for _ in range(input_source.page_count)
    ]
    for crop in crops:
        polygons_by_page[crop.location.page].append(crop.location.polygon)
    images: List[ExtractedImage] = []
    for page_id, page_polygons in enumerate(polygons_by_page):
        # A page without any crop has nothing to extract; skip the call.
        if not page_polygons:
            continue
        images.extend(
            extract_multiple_images_from_source(
                input_source,
                page_id,
                page_polygons,
            )
        )
    return CropFiles(images)
20 changes: 20 additions & 0 deletions mindee/v2/file_operations/crop_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from pathlib import Path
from typing import List, Union

from mindee.extraction import ExtractedImage


class CropFiles(List[ExtractedImage]):
    """List of extracted crop images that can be saved to disk in one call."""

    def save_all_to_disk(self, path: Union[Path, str]):
        """
        Write every crop of this list into a directory.

        The directory, along with any missing parent, is created when needed.
        Files are named ``crop_001.jpg``, ``crop_002.jpg``, ... in list order.

        :param path: Directory to save the extracted crops to
        """
        target_dir = Path(path) if isinstance(path, str) else path
        target_dir.mkdir(parents=True, exist_ok=True)
        for number, crop in enumerate(self, start=1):
            destination = target_dir / f"crop_{number:03}.jpg"
            crop.save_to_file(destination)
33 changes: 33 additions & 0 deletions mindee/v2/file_operations/split.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from typing import List, Union

from mindee.error import MindeeError
from mindee.extraction import PdfExtractor
from mindee.input.sources.local_input_source import LocalInputSource
from mindee.v2.file_operations.split_files import SplitFiles
from mindee.v2.product.split.split_range import SplitRange


def extract_splits(
    input_source: LocalInputSource,
    splits: Union[List[SplitRange], List[List[int]]],
) -> SplitFiles:
    """
    Extract page ranges of a document as stand-alone PDFs.

    :param input_source: Input source to split.
    :param splits: Page ranges to extract, given either as ``SplitRange``
        objects or as ``[first_page, last_page]`` lists. Both bounds are
        included in the range.
    :return: One extracted PDF per range, in the order of ``splits``.
    :raises MindeeError: If ``splits`` is empty.
    """
    # Validate first so that an empty request fails before the extractor is
    # built or any range is computed.
    if not splits:
        raise MindeeError("No indexes provided.")
    page_groups = []
    for split in splits:
        bounds = split.page_range if isinstance(split, SplitRange) else split
        # The upper bound is inclusive, hence the + 1.
        page_groups.append(list(range(bounds[0], bounds[1] + 1)))
    pdf_extractor = PdfExtractor(input_source)
    return SplitFiles(pdf_extractor.extract_sub_documents(page_groups))
20 changes: 20 additions & 0 deletions mindee/v2/file_operations/split_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from pathlib import Path
from typing import List, Union

from mindee.extraction.pdf_extractor.extracted_pdf import ExtractedPdf


class SplitFiles(List[ExtractedPdf]):
    """List of extracted PDFs that can be saved to disk in one call."""

    def save_all_to_disk(self, path: Union[str, Path]):
        """
        Write every split of this list into a directory.

        The directory, along with any missing parent, is created when needed.
        Files are named ``split_001.pdf``, ``split_002.pdf``, ... in list order.

        :param path: Directory to save the extracted splits to
        """
        target_dir = Path(path) if isinstance(path, str) else path
        target_dir.mkdir(parents=True, exist_ok=True)
        for number, extracted_pdf in enumerate(self, start=1):
            destination = target_dir / f"split_{number:03}.pdf"
            extracted_pdf.save_to_file(destination)
13 changes: 13 additions & 0 deletions mindee/v2/product/crop/crop_box.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from mindee.extraction import ExtractedImage, extract_multiple_images_from_source
from mindee.input.sources.local_input_source import LocalInputSource
from mindee.parsing.common.string_dict import StringDict
from mindee.parsing.v2.field.field_location import FieldLocation

Expand All @@ -16,3 +18,14 @@ def __init__(self, server_response: StringDict):

def __str__(self) -> str:
return f"* :Location: {self.location}\n :Object Type: {self.object_type}"

def extract_from_file(self, input_source: LocalInputSource) -> ExtractedImage:
    """
    Cut this crop's zone out of a file and return it as a single image.

    :param input_source: Local file to apply the inference to
    :return: Extracted image
    """
    page_id = self.location.page
    zone = self.location.polygon
    images = extract_multiple_images_from_source(input_source, page_id, [zone])
    return images[0]
16 changes: 16 additions & 0 deletions mindee/v2/product/crop/crop_response.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from mindee.input.sources.local_input_source import LocalInputSource
from mindee.parsing.common.string_dict import StringDict
from mindee.v2.file_operations.crop_files import CropFiles
from mindee.v2.parsing.inference import BaseResponse
from mindee.v2.product.crop.crop_inference import CropInference

Expand All @@ -15,3 +17,17 @@ class CropResponse(BaseResponse):
def __init__(self, raw_response: StringDict) -> None:
super().__init__(raw_response)
self.inference = CropInference(raw_response["inference"])

def extract_from_file(self, input_source: LocalInputSource) -> CropFiles:
    """
    Apply the crop inference to a file and collect one image per crop.

    :param input_source: Local file to apply the inference to
    :return: Extracted images, in the order of the inference's crops
    """
    extracted = CropFiles()
    for crop in self.inference.result.crops:
        extracted.append(crop.extract_from_file(input_source))
    return extracted
13 changes: 13 additions & 0 deletions mindee/v2/product/split/split_range.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
from typing import List

from mindee.extraction.pdf_extractor.extracted_pdf import ExtractedPdf
from mindee.extraction.pdf_extractor.pdf_extractor import PdfExtractor
from mindee.input.sources.local_input_source import LocalInputSource
from mindee.parsing.common.string_dict import StringDict


Expand All @@ -21,3 +24,13 @@ def __init__(self, server_response: StringDict):
def __str__(self) -> str:
page_range = ",".join([str(page_index) for page_index in self.page_range])
return f"* :Page Range: {page_range}\n :Document Type: {self.document_type}"

def extract_from_file(self, input_source: LocalInputSource) -> ExtractedPdf:
    """
    Extract this page range from a file as one stand-alone PDF.

    :param input_source: Local file to apply the inference to
    :return: Extracted PDF
    """
    sub_documents = PdfExtractor(input_source).extract_sub_documents(
        [self.page_range]
    )
    return sub_documents[0]
Loading
Loading