Skip to content

Commit

Permalink
Merge pull request #192 from jamebal/develop
Browse files Browse the repository at this point in the history
fix: 修复部分word和pdf文档检索不到内容的问题
  • Loading branch information
jamebal authored Dec 4, 2024
2 parents 388dedc + 10b9653 commit ea68dfa
Showing 1 changed file with 122 additions and 69 deletions.
191 changes: 122 additions & 69 deletions src/main/java/com/jmal/clouddisk/lucene/ReadContentService.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,10 @@
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.hslf.usermodel.HSLFShape;
import org.apache.poi.hslf.usermodel.HSLFSlide;
import org.apache.poi.hslf.usermodel.HSLFSlideShow;
import org.apache.poi.hslf.usermodel.HSLFTextShape;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
Expand All @@ -36,8 +38,7 @@
import org.apache.poi.xslf.usermodel.XSLFSlide;
import org.apache.poi.xslf.usermodel.XSLFTextShape;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.*;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.springframework.stereotype.Service;
Expand All @@ -50,7 +51,6 @@
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.util.List;
import java.util.regex.Pattern;

@Service
Expand Down Expand Up @@ -83,9 +83,22 @@ public String dwg2mxweb(File file, String fileId) {
return null;
}

/**
 * Reports whether a page references at least one image XObject.
 *
 * <p>Only the XObjects listed in the resources returned by {@code page.getResources()}
 * are examined; the method stops at the first image it finds.
 *
 * @param document  the loaded PDF document
 * @param pageIndex zero-based index of the page to inspect
 * @return {@code true} if an image XObject is found, {@code false} otherwise
 *         (including when the page exposes no resources at all)
 * @throws IOException if an XObject cannot be read from the resources
 */
public static boolean checkPageContent(PDDocument document, int pageIndex) throws IOException {
    PDPage page = document.getPage(pageIndex);
    PDResources resources = page.getResources();
    if (resources == null) {
        // A page without a resource dictionary cannot reference any image;
        // without this guard the loop below would throw a NullPointerException.
        return false;
    }
    for (COSName xObjectName : resources.getXObjectNames()) {
        PDXObject xObject = resources.getXObject(xObjectName);
        if (xObject instanceof PDImageXObject) {
            // One image is enough to answer the question, so exit early.
            return true;
        }
    }
    return false;
}

public String readPdfContent(File file, String fileId) {
try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile(file))) {

String username = commonFileService.getUsernameByAbsolutePath(Path.of(file.getAbsolutePath()));

// 生成封面图像
Expand All @@ -95,35 +108,21 @@ public String readPdfContent(File file, String fileId) {
}

StringBuilder content = new StringBuilder();
// 提取每一页的内容
PDFRenderer pdfRenderer = new PDFRenderer(document);
PDFTextStripper pdfStripper = new PDFTextStripper();
for (int pageNumber = 1; pageNumber <= document.getNumberOfPages(); pageNumber++) {
pdfStripper.setStartPage(pageNumber);
pdfStripper.setEndPage(pageNumber);

for (int pageIndex = 0; pageIndex < document.getNumberOfPages(); pageIndex++) { // 使用 0-based 索引
pdfStripper.setStartPage(pageIndex + 1); // PDFTextStripper 使用 1-based 索引
pdfStripper.setEndPage(pageIndex + 1);
String text = pdfStripper.getText(document).trim();

// 如果页面包含文字,添加提取的文字
if (!text.isEmpty()) {
content.append(text);
} else {
taskProgressService.addTaskProgress(file, TaskType.OCR, pageNumber + "/" + document.getNumberOfPages());
PDPage page = document.getPage(pageNumber - 1);
PDResources resources = page.getResources();
for (COSName xObjectName : resources.getXObjectNames()) {
PDXObject xObject = resources.getXObject(xObjectName);
if (xObject instanceof PDImageXObject image) {
BufferedImage bufferedImage = image.getImage();
// 将图像保存到临时文件
String tempImageFile = ocrService.generateOrcTempImagePath(username);
ImageIO.write(bufferedImage, "png", new File(tempImageFile));
try {
// 使用 Tesseract 进行 OCR 识别
String ocrResult = ocrService.doOCR(tempImageFile, ocrService.generateOrcTempImagePath(username));
content.append(ocrResult);
} finally {
// 删除临时文件
FileUtil.del(tempImageFile);
}
}
}
}
// 如果页面包含图片或没有文字,则进行 OCR
if (checkPageContent(document, pageIndex) || text.isEmpty()) {
content.append(extractPageWithOCR(pdfRenderer, pageIndex, username));
}
}
return content.toString();
Expand All @@ -135,6 +134,23 @@ public String readPdfContent(File file, String fileId) {
return null;
}

/**
 * Renders a single PDF page to a temporary PNG at 300 DPI and runs OCR on it.
 *
 * <p>Any failure (rendering, writing the image, OCR, or cleanup) is logged and
 * turned into an empty result so that one bad page does not abort the whole document.
 *
 * @param pdfRenderer renderer bound to the document being processed
 * @param pageIndex   zero-based index of the page to render
 * @param username    user name passed to {@code ocrService.generateOrcTempImagePath}
 * @return the recognised text, or an empty string if OCR produced nothing or failed
 */
private String extractPageWithOCR(PDFRenderer pdfRenderer, int pageIndex, String username) {
    try {
        BufferedImage pageImage = pdfRenderer.renderImageWithDPI(pageIndex, 300);
        String tempImageFile = ocrService.generateOrcTempImagePath(username);
        try {
            // Write inside the cleanup scope: if the write fails half-way, the
            // partially written temp file is still removed by the finally block.
            ImageIO.write(pageImage, "png", new File(tempImageFile));
            // NOTE(review): the second argument is a freshly generated temp path; it is
            // not cleaned up here — confirm that doOCR takes care of it.
            String ocrResult = ocrService.doOCR(tempImageFile, ocrService.generateOrcTempImagePath(username));
            // The caller appends the result to a StringBuilder, where a null would
            // become the literal text "null"; normalise it to an empty string.
            return ocrResult == null ? "" : ocrResult;
        } finally {
            FileUtil.del(tempImageFile);
        }
    } catch (Exception e) {
        // Log with a 1-based page number to match what users see in a PDF viewer.
        log.error("Error processing page {}", pageIndex + 1, e);
        return "";
    }
}

public String readEpubContent(File file, String fileId) {
try (InputStream fileInputStream = new FileInputStream(file)) {
// 打开 EPUB 文件
Expand Down Expand Up @@ -177,48 +193,74 @@ public String readPPTContent(File file) {
if (fileName.endsWith(".pptx")) {
// 读取 .pptx 文件
try (XMLSlideShow pptx = new XMLSlideShow(fis)) {
for (XSLFSlide slide : pptx.getSlides()) {
for (XSLFShape shape : slide.getShapes()) {
if (shape instanceof XSLFTextShape textShape) {
stringBuilder.append(textShape.getText()).append(" ");
}
}
}
readSlides(pptx.getSlides(), stringBuilder);
}
} else if (fileName.endsWith(".ppt")) {
// 读取 .ppt 文件
try (HSLFSlideShow ppt = new HSLFSlideShow(fis)) {
for (org.apache.poi.hslf.usermodel.HSLFSlide slide : ppt.getSlides()) {
for (HSLFShape shape : slide.getShapes()) {
if (shape instanceof HSLFTextShape textShape) {
stringBuilder.append(textShape.getText()).append(" ");
}
}
}
readSlides(ppt.getSlides(), stringBuilder);
}
} else {
throw new IllegalArgumentException("不支持的文件格式");
throw new IllegalArgumentException("Unsupported file format: " + fileName);
}
} catch (IOException e) {
FileContentUtil.readFailed(file, e);
}

return stringBuilder.toString().trim();
}

/**
 * Collects the text of every text shape found on the given slides.
 *
 * <p>Handles both slide flavours used by this class: {@code XSLFSlide} and
 * {@code HSLFSlide}. Elements of any other type are ignored. Each shape's text
 * is appended to {@code content} followed by a single space.
 *
 * @param slides  the slides to scan
 * @param content the builder that receives the extracted text
 */
private void readSlides(Iterable<?> slides, StringBuilder content) {
    for (Object candidate : slides) {
        if (candidate instanceof XSLFSlide ooxmlSlide) {
            for (XSLFShape ooxmlShape : ooxmlSlide.getShapes()) {
                if (!(ooxmlShape instanceof XSLFTextShape ooxmlText)) {
                    continue;
                }
                content.append(ooxmlText.getText());
                content.append(" ");
            }
            continue;
        }
        if (candidate instanceof HSLFSlide binarySlide) {
            for (HSLFShape binaryShape : binarySlide.getShapes()) {
                if (!(binaryShape instanceof HSLFTextShape binaryText)) {
                    continue;
                }
                content.append(binaryText.getText());
                content.append(" ");
            }
        }
    }
}
public String readWordContent(File file) {
try (FileInputStream fis = new FileInputStream(file)) {
try {
// 尝试读取 OOXML 格式 (.docx) 文件
// 读取 DOCX 文件
XWPFDocument document = new XWPFDocument(fis);
StringBuilder stringBuilder = new StringBuilder();
List<XWPFParagraph> paragraphs = document.getParagraphs();
for (XWPFParagraph para : paragraphs) {
stringBuilder.append(para.getText());

// 1. 读取段落
for (XWPFParagraph paragraph : document.getParagraphs()) {
stringBuilder.append(paragraph.getText()).append("\n");
}

// 2. 读取表格
for (XWPFTable table : document.getTables()) {
for (XWPFTableRow row : table.getRows()) {
for (XWPFTableCell cell : row.getTableCells()) {
stringBuilder.append(cell.getText()).append("\t");
}
stringBuilder.append("\n");
}
}

// 3. 读取页眉
for (XWPFHeader header : document.getHeaderList()) {
stringBuilder.append(header.getText()).append("\n");
}

// 4. 读取页脚
for (XWPFFooter footer : document.getFooterList()) {
stringBuilder.append(footer.getText()).append("\n");
}
return stringBuilder.toString();
} catch (OLE2NotOfficeXmlFileException e) {
// 如果文件不是 OOXML 格式,尝试读取 OLE2 格式 (.doc) 文件
// 读取 DOC 文件
try (FileInputStream fis2 = new FileInputStream(file);
POIFSFileSystem poifs = new POIFSFileSystem(fis2);
HWPFDocument doc = new HWPFDocument(poifs)) {
Expand All @@ -232,37 +274,48 @@ public String readWordContent(File file) {
return null;
}

private static final Pattern NON_NUMERIC_PATTERN = Pattern.compile("[^0-9]+");
// 匹配包含至少一个中文或英文字符的字符串
private static final Pattern TEXT_PATTERN = Pattern.compile(".*[a-zA-Z一-龥]+.*");

public String readExcelContent(File file) {
StringBuilder content = new StringBuilder();
try (FileInputStream fis = new FileInputStream(file)) {
Workbook workbook;
if (file.getName().endsWith(".xlsx")) {
workbook = new XSSFWorkbook(fis);
} else if (file.getName().endsWith(".xls")) {
workbook = new HSSFWorkbook(fis);
} else {
throw new IllegalArgumentException("不支持的文件格式");
}

for (Sheet sheet : workbook) {
for (Row row : sheet) {
for (Cell cell : row) {
String cellValue = getCellValueAsString(cell);
// 过滤掉数字,只保留文字
if (NON_NUMERIC_PATTERN.matcher(cellValue).matches()) {
content.append(cellValue).append(" ");
}
}
try (FileInputStream fis = new FileInputStream(file)) {
try (Workbook workbook = createWorkbook(file, fis)) {
for (Sheet sheet : workbook) {
readSheetContent(sheet, content);
}
}
} catch (IOException e) {
FileContentUtil.readFailed(file, e);
} catch (IllegalArgumentException e) {
log.warn("Unsupported file format: {}", file.getName());
}
return content.toString().trim();
}

/**
 * Opens the workbook implementation that matches the file's extension.
 *
 * <p>The extension is compared case-insensitively, so names such as
 * {@code REPORT.XLSX} or {@code Data.Xls} are accepted as well.
 *
 * @param file the spreadsheet file; only its name is inspected here
 * @param fis  an open stream positioned at the start of that file
 * @return an {@code XSSFWorkbook} for {@code .xlsx}, an {@code HSSFWorkbook} for {@code .xls}
 * @throws IOException              if the workbook cannot be read from the stream
 * @throws IllegalArgumentException if the extension is neither {@code .xlsx} nor {@code .xls}
 */
private Workbook createWorkbook(File file, FileInputStream fis) throws IOException {
    // Locale.ROOT keeps the lower-casing independent of the JVM's default locale
    // (fully qualified to avoid touching the file's import block).
    String lowerCaseName = file.getName().toLowerCase(java.util.Locale.ROOT);
    if (lowerCaseName.endsWith(".xlsx")) {
        return new XSSFWorkbook(fis);
    }
    if (lowerCaseName.endsWith(".xls")) {
        return new HSSFWorkbook(fis);
    }
    throw new IllegalArgumentException("Unsupported file format");
}

/**
 * Appends the textual cells of a sheet to {@code content}, separated by spaces.
 *
 * <p>A cell is kept when its string value contains at least one character accepted
 * by {@code TEXT_PATTERN} (an ASCII letter or a character in the pattern's CJK range);
 * cells without any such character are skipped.
 *
 * @param sheet   the sheet whose rows and cells are scanned
 * @param content the builder that receives the kept cell values
 */
private void readSheetContent(Sheet sheet, StringBuilder content) {
    for (Row row : sheet) {
        for (Cell cell : row) {
            String cellValue = getCellValueAsString(cell);
            // Use find() rather than matches(): the pattern's '.' does not match line
            // terminators, so matches() rejected every cell containing a line break
            // and its text was silently dropped.
            if (TEXT_PATTERN.matcher(cellValue).find()) {
                content.append(cellValue).append(" ");
            }
        }
    }
}

private String getCellValueAsString(Cell cell) {
if (cell == null) {
return "";
Expand Down

0 comments on commit ea68dfa

Please sign in to comment.