diff --git a/CHANGELOG.md b/CHANGELOG.md index 1d77591d..18b42cf4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -72,7 +72,7 @@ Fix syntax for generated HTML tables ## 0.7.22 -* fix: add logic to handle computation of intersections betwen 2 `Rectangle`s when a `Rectangle` has `None` value in its coordinates +* fix: add logic to handle computation of intersections between 2 `Rectangle`s when a `Rectangle` has `None` value in its coordinates ## 0.7.21 @@ -111,8 +111,8 @@ Fix syntax for generated HTML tables * refactor: add a class `ElementType` for the element type constants and use the constants to replace element type strings * enhancement: support extracting elements with types `Picture` and `Figure` -* fix: update logger in table initalization where the logger info was not showing -* chore: supress UserWarning about specified model providers +* fix: update logger in table initialization where the logger info was not showing +* chore: suppress UserWarning about specified model providers ## 0.7.12 @@ -215,7 +215,7 @@ we have the mapping from standard language code to paddle language code. ## 0.6.0 -* add a config class to handle parameter configurations for inference tasks; parameters in the config class can be set via environement variables +* add a config class to handle parameter configurations for inference tasks; parameters in the config class can be set via environment variables * update behavior of `pad_image_with_background_color` so that input `pad` is applied to all sides ## 0.5.31 @@ -256,7 +256,7 @@ we have the mapping from standard language code to paddle language code. 
## 0.5.21 -* adds `safe_division` to replae 0 with machine epsilon for `float` to avoid division by 0 +* adds `safe_division` to replace 0 with machine epsilon for `float` to avoid division by 0 * apply `safe_division` to area overlap calculations in `unstructured_inference/inference/elements.py` ## 0.5.20 @@ -346,7 +346,7 @@ we have the mapping from standard language code to paddle language code. * Added functionality to convert a PDF in small chunks of pages at a time for `pdf2image.convert_from_path` * Table processing check for the area of the package to fix division by zero bug * Added CUDA and TensorRT execution providers for yolox and detectron2onnx model. -* Warning for onnx version of detectron2 for empty pages suppresed. +* Warning for onnx version of detectron2 for empty pages suppressed. ## 0.5.4 diff --git a/README.md b/README.md index ac759757..fdb502c1 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ Run `pip install unstructured-inference`. [Detectron2](https://github.com/facebookresearch/detectron2) is required for using models from the [layoutparser model zoo](#using-models-from-the-layoutparser-model-zoo) but is not automatically installed with this package. -For MacOS and Linux, build from source with: +For macOS and Linux, build from source with: ```shell pip install 'git+https://github.com/facebookresearch/detectron2.git@57bdb21249d5418c130d54e2ebdc94dda7a4c01a' ``` @@ -89,6 +89,6 @@ information on how to report security vulnerabilities. 
| Section | Description | |-|-| -| [Unstructured Community Github](https://github.com/Unstructured-IO/community) | Information about Unstructured.io community projects | -| [Unstructured Github](https://github.com/Unstructured-IO) | Unstructured.io open source repositories | +| [Unstructured Community GitHub](https://github.com/Unstructured-IO/community) | Information about Unstructured.io community projects | +| [Unstructured GitHub](https://github.com/Unstructured-IO) | Unstructured.io open source repositories | | [Company Website](https://unstructured.io) | Unstructured.io product and company info | diff --git a/test_unstructured_inference/models/test_chippermodel.py b/test_unstructured_inference/models/test_chippermodel.py index c68aa6bc..ad51dc53 100644 --- a/test_unstructured_inference/models/test_chippermodel.py +++ b/test_unstructured_inference/models/test_chippermodel.py @@ -190,11 +190,11 @@ def test_no_repeat_ngram_logits(): ) -def test_ngram_repetiton_stopping_criteria(): +def test_ngram_repetition_stopping_criteria(): input_ids = torch.tensor([[1, 2, 3, 4, 0, 1, 2, 3, 4]]) logits = torch.tensor([[0.1, -0.3, -0.5, 0, 1.0, -0.9]]) - stoppingCriteria = chipper.NGramRepetitonStoppingCriteria( + stoppingCriteria = chipper.NGramRepetitionStoppingCriteria( repetition_window=2, skip_tokens={0, 1, 2, 3, 4} ) @@ -202,7 +202,7 @@ def test_ngram_repetiton_stopping_criteria(): assert output is False - stoppingCriteria = chipper.NGramRepetitonStoppingCriteria( + stoppingCriteria = chipper.NGramRepetitionStoppingCriteria( repetition_window=2, skip_tokens={1, 2, 3, 4} ) output = stoppingCriteria(input_ids=input_ids, scores=logits) @@ -259,7 +259,7 @@ def test_postprocess_bbox(decoded_str, expected_classes): def test_predict_tokens_beam_indices(): model = get_model("chipper") model.stopping_criteria = [ - chipper.NGramRepetitonStoppingCriteria( + chipper.NGramRepetitionStoppingCriteria( repetition_window=1, skip_tokens={}, ), diff --git 
a/test_unstructured_inference/models/test_tables.py b/test_unstructured_inference/models/test_tables.py index 15c467cd..4c0d8155 100644 --- a/test_unstructured_inference/models/test_tables.py +++ b/test_unstructured_inference/models/test_tables.py @@ -927,7 +927,7 @@ def test_table_prediction_output_format( assert expectation in result.values elif output_format == "cells": # other output like bbox are flakey to test since they depend on OCR and it may change - # slightly when OCR pacakge changes or even on different machines + # slightly when OCR package changes or even on different machines validation_fields = ("column_nums", "row_nums", "column header", "cell text") assert expectation in [{key: cell[key] for key in validation_fields} for cell in result] else: @@ -1763,11 +1763,11 @@ def test_padded_results_has_right_dimensions(table_transformer, example_image): pad = int(min(example_image.size) / 10) structure = table_transformer.get_structure(example_image, pad_for_structure_detection=pad) - # boxes deteced OUTSIDE of the original image; this shouldn't happen but we want to make sure + # boxes detected OUTSIDE of the original image; this shouldn't happen but we want to make sure # the code handles it as expected structure["pred_boxes"][0][0, :2] = 0.5 structure["pred_boxes"][0][0, 2:] = 1.0 - # mock a box we know are safly inside the original image with known positions + # mock a box we know is safely inside the original image with known positions width, height = example_image.size padded_width = width + pad * 2 padded_height = height + pad * 2 diff --git a/test_unstructured_inference/test_utils.py b/test_unstructured_inference/test_utils.py index 399ca739..874a13e3 100644 --- a/test_unstructured_inference/test_utils.py +++ b/test_unstructured_inference/test_utils.py @@ -77,7 +77,7 @@ def test_pad_image_with_background_color(mock_pil_image): def test_pad_image_with_invalid_input(mock_pil_image): - with pytest.raises(ValueError, match="Can not pad an image with 
negative space!"): + with pytest.raises(ValueError, match="Cannot pad an image with negative space!"): pad_image_with_background_color(mock_pil_image, -1) diff --git a/unstructured_inference/inference/layoutelement.py b/unstructured_inference/inference/layoutelement.py index 37a9ef24..2b2c6a1d 100644 --- a/unstructured_inference/inference/layoutelement.py +++ b/unstructured_inference/inference/layoutelement.py @@ -178,7 +178,7 @@ def separate(region_a: Rectangle, region_b: Rectangle): """Reduce leftmost rectangle to don't overlap with the other""" def reduce(keep: Rectangle, reduce: Rectangle): - # Asume intersection + # Assume intersection # Other is down if reduce.y2 > keep.y2 and reduce.x1 < keep.x2: diff --git a/unstructured_inference/logger.py b/unstructured_inference/logger.py index c9645ac1..c0b69384 100644 --- a/unstructured_inference/logger.py +++ b/unstructured_inference/logger.py @@ -2,7 +2,7 @@ def translate_log_level(level: int) -> int: - """Translate Python debugg level to ONNX runtime error level + """Translate Python debug level to ONNX runtime error level since blank pages error are shown at level 3 that should be the exception, and 4 the normal behavior""" level_name = logging.getLevelName(level) diff --git a/unstructured_inference/models/chipper.py b/unstructured_inference/models/chipper.py index 857c83e9..4f9305e8 100644 --- a/unstructured_inference/models/chipper.py +++ b/unstructured_inference/models/chipper.py @@ -102,7 +102,7 @@ def initialize( ] self.stopping_criteria = [ - NGramRepetitonStoppingCriteria( + NGramRepetitionStoppingCriteria( repetition_window=30, skip_tokens=get_table_token_ids(self.processor), ), @@ -137,7 +137,7 @@ def initialize( else: if swap_head_hidden_layer_size is not None: logger.warning( - f"swap_head is False but recieved value {swap_head_hidden_layer_size} for " + f"swap_head is False but received value {swap_head_hidden_layer_size} for " "swap_head_hidden_layer_size, which will be ignored.", ) @@ -658,7 +658,7 @@ 
def reduce_bbox_overlap( input_bbox: List[float], ) -> List[float]: """ - If an element does overlap with other elements, reduce bouding box by selecting the largest + If an element does overlap with other elements, reduce bounding box by selecting the largest bbox after blurring existing text """ input_bbox = [int(b) for b in input_bbox] @@ -1027,7 +1027,7 @@ def __call__( ) -class NGramRepetitonStoppingCriteria(StoppingCriteria): +class NGramRepetitionStoppingCriteria(StoppingCriteria): def __init__(self, repetition_window: int, skip_tokens: set = set()): self.repetition_window = repetition_window self.skip_tokens = skip_tokens diff --git a/unstructured_inference/models/detectron2onnx.py b/unstructured_inference/models/detectron2onnx.py index 79cd0a1a..f9d87b6f 100644 --- a/unstructured_inference/models/detectron2onnx.py +++ b/unstructured_inference/models/detectron2onnx.py @@ -48,7 +48,7 @@ "model_path": os.path.join( HUGGINGFACE_HUB_CACHE, "detectron2_quantized", - "detectrin2_quantized.onnx", + "detectron2_quantized.onnx", ), "label_map": DEFAULT_LABEL_MAP, "confidence_threshold": 0.8, @@ -131,7 +131,7 @@ def preprocess(self, image: Image.Image) -> Dict[str, np.ndarray]: """ # TODO (benjamin): check other shapes for inference img = np.array(image) - # TODO (benjamin): We should use models.get_model() but currenly returns Detectron model + # TODO (benjamin): We should use models.get_model() but currently returns Detectron model session = self.model # onnx input expected # [3,1035,800] diff --git a/unstructured_inference/models/tables.py b/unstructured_inference/models/tables.py index c390378e..655746b7 100644 --- a/unstructured_inference/models/tables.py +++ b/unstructured_inference/models/tables.py @@ -84,7 +84,7 @@ def get_structure( x: PILImage.Image, pad_for_structure_detection: int = inference_config.TABLE_IMAGE_BACKGROUND_PAD, ) -> dict: - """get the table structure as a dictionary contaning different types of elements as + """get the table structure as a 
dictionary containing different types of elements as key-value pairs; check table-transformer documentation for more information""" with torch.no_grad(): encoding = self.feature_extractor( diff --git a/unstructured_inference/models/yolox.py b/unstructured_inference/models/yolox.py index 0acd93f3..5c6041a1 100644 --- a/unstructured_inference/models/yolox.py +++ b/unstructured_inference/models/yolox.py @@ -91,7 +91,7 @@ def image_processing( self, image: PILImage.Image, ) -> List[LayoutElement]: - """Method runing YoloX for layout detection, returns a PageLayout + """Method running YoloX for layout detection, returns a PageLayout parameters ---------- page @@ -99,7 +99,7 @@ def image_processing( origin_img If specified, an Image object for process with YoloX model page_number - Number asigned to the PageLayout returned + Number assigned to the PageLayout returned output_directory Boolean indicating if result will be stored """ @@ -125,7 +125,7 @@ def image_processing( boxes_xyxy[:, 3] = boxes[:, 1] + boxes[:, 3] / 2.0 boxes_xyxy /= ratio - # Note (Benjamin): Distinct models (quantized and original) requires distincts + # Note (Benjamin): Distinct models (quantized and original) require distinct # levels of thresholds if "quantized" in self.model_path: dets = multiclass_nms(boxes_xyxy, scores, nms_thr=0.0, score_thr=0.07) diff --git a/unstructured_inference/utils.py b/unstructured_inference/utils.py index 696a2e8a..46affad1 100644 --- a/unstructured_inference/utils.py +++ b/unstructured_inference/utils.py @@ -50,7 +50,7 @@ def __len__(self) -> int: def tag(elements: Iterable[LayoutElement]): - """Asign an numeric id to the elements in the list. + """Assign a numeric id to the elements in the list. Useful for debugging""" colors = ["red", "blue", "green", "magenta", "brown"] for i, e in enumerate(elements): @@ -72,7 +72,7 @@ def pad_image_with_background_color( width, height = image.size if pad < 0: raise ValueError( - "Can not pad an image with negative space! 
Please use a positive value for `pad`.", + "Cannot pad an image with negative space! Please use a positive value for `pad`.", ) new = Image.new(image.mode, (width + pad * 2, height + pad * 2), background_color) new.paste(image, (pad, pad))