diff --git a/pdfplumber/table.py b/pdfplumber/table.py
index f189742e..102e48cd 100644
--- a/pdfplumber/table.py
+++ b/pdfplumber/table.py
@@ -357,12 +357,15 @@ def get_centroids(indices):
                 yield (vertices[index - 1] + vertices[index]) / 2
 
         def get_cell_by_centroid(centroid_x, centroid_y):
+            # First cell whose bounds strictly contain the centroid, or
+            # None when no cell matches (instead of raising StopIteration).
             return next(
                 filter(
                     lambda c: c[0] < centroid_x < c[2] and c[1] < centroid_y < c[3],
                     self.cells,
-                )
+                ),
+                None,
             )
 
         x_centroids = list(get_centroids((0, 2)))
         y_centroids = list(get_centroids((1, 3)))
diff --git a/tests/pdfs/issue-420-example.pdf b/tests/pdfs/issue-420-example.pdf
index 7ecd215f..e719c289 100644
Binary files a/tests/pdfs/issue-420-example.pdf and b/tests/pdfs/issue-420-example.pdf differ
diff --git a/tests/test_table.py b/tests/test_table.py
index eb48c905..d31e0e57 100644
--- a/tests/test_table.py
+++ b/tests/test_table.py
@@ -102,13 +102,13 @@ def test_merge_cell(self):
         See issue #420
         """
         path = os.path.join(HERE, "pdfs/issue-420-example.pdf")
-        TABLE_0 = [
+        TABLE_0_0 = [
             ["Header 1", "Header 2", "Header 3", "Header 4"],
             ["Merged cell 1", "Merged cell 2", "Cell 3-1", "Cell 4-1"],
             ["Merged cell 1", "Merged cell 2", "Cell 3-2", "Cell 4-2"],
             ["Merged cell 1", "Merged cell 2", "Cell 3-2", "Cell 4-3"],
         ]
-        TABLE_1 = [
+        TABLE_0_1 = [
             ["1-1", "1-1", "1-3", "1-4", "1-5", "1-6"],
             ["2-1", "2-2", "2-3", "2-4", "1-5", "1-6"],
             ["3-1", "3-2", "3-3", "3-4", "1-5", "1-6"],
@@ -118,9 +118,14 @@ def test_merge_cell(self):
             ["7-1", "7-1", "7-1", "7-1", "1-5", "1-6"],
             ["7-1", "7-1", "7-1", "7-1", "7-5", "1-6"],
         ]
+        TABLE_1_0 = [["1", "2"], [None, "4"]]
 
         with pdfplumber.open(path) as pdf:
             tables = pdf.pages[0].extract_tables()
             assert len(tables) == 2
-            assert TABLE_0 == tables[0]
-            assert TABLE_1 == tables[1]
+            assert TABLE_0_0 == tables[0]
+            assert TABLE_0_1 == tables[1]
+
+            tables_missing_edges = pdf.pages[1].extract_tables()
+            assert len(tables_missing_edges) == 1
+            assert TABLE_1_0 == tables_missing_edges[0]