Skip to content

Commit

Permalink
#9 & #10 support for .rtf and .doc files
Browse files Browse the repository at this point in the history
  • Loading branch information
tramyardg committed May 28, 2018
1 parent b8541e0 commit 747199f
Show file tree
Hide file tree
Showing 6 changed files with 97 additions and 33 deletions.
5 changes: 5 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,11 @@
<artifactId>poi-ooxml</artifactId>
<version>3.13</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>3.13</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/com/cv/parser/CVparserMain.java
Original file line number Diff line number Diff line change
Expand Up @@ -133,11 +133,11 @@ protected void createContents() {
pdf.main();
superList.addAll(pdf.getContents());

MSExtractor ms = new MSExtractor(filesInPublicDir);
MSExtractor ms = new MSExtractor();
ms.main();
superList.addAll(ms.getContents());

TXTExtractor txt = new TXTExtractor(filesInPublicDir);
TXTExtractor txt = new TXTExtractor();
txt.main();
superList.addAll(txt.getContents());

Expand Down
4 changes: 3 additions & 1 deletion src/main/java/com/cv/parser/FileExtension.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ public FileExtension() {
}

public enum Ext {
PDF, DOC, DOCX, TXT
PDF, DOC, DOCX, TXT, RTF
}

public String get(Ext ext) {
Expand All @@ -20,6 +20,8 @@ public String get(Ext ext) {
return ".docx";
case TXT:
return ".txt";
case RTF:
return ".rtf";
}
return null;
}
Expand Down
55 changes: 33 additions & 22 deletions src/main/java/com/cv/parser/extract/MSExtractor.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,14 @@

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.lang3.ArrayUtils;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.slf4j.Logger;
Expand All @@ -17,45 +20,38 @@
import com.cv.parser.FileFinderByExt;

/**
* Supports MS Word 2004+.
* Supports MS Word 2004+ documents with the .doc and .docx file extensions.
*
* @author RAYMARTHINKPAD
*
*/
public class MSExtractor implements IExtractor {
Logger logger = LoggerFactory.getLogger(MSExtractor.class);

FileExtension fe = new FileExtension();
FileFinderByExt find = new FileFinderByExt();
private FileExtension fileExtension = new FileExtension();
private FileFinderByExt fileFinderByExt = new FileFinderByExt();

File[] msDocs;
List<String> contents = new ArrayList<String>();

File[] filesInPublicDir;

public MSExtractor(File[] filesInPublicDir) {
this.filesInPublicDir = filesInPublicDir;
}
private File[] docxFiles;
private File[] docFiles;
private List<String> contents = new ArrayList<String>();

/**
 * Runs the whole extraction: first looks up the Word files, then extracts
 * their text into the contents list.
 */
@Override
public void main() {
    setFiles();
    extractFiles();
}

public void setFiles() {
File[] doc = find.finder(fe.get(Ext.DOC));
File[] docx = find.finder(fe.get(Ext.DOCX));
if (doc.length != 0 && docx.length != 0) {
this.msDocs = ArrayUtils.addAll(doc, docx);
} else if (doc.length != 0) {
this.msDocs = doc;
} else if (docx.length != 0) {
this.msDocs = docx;
}
this.docxFiles = fileFinderByExt.finder(fileExtension.get(Ext.DOCX));
this.docFiles = fileFinderByExt.finder(fileExtension.get(Ext.DOC));
}

public void extractFiles() {
for (File file : msDocs) {
extractDocxFiles();
extractDocFiles();
}

public void extractDocxFiles() {
for (File file : docxFiles) {
FileInputStream fs = null;
XWPFDocument msDoc = null;
XWPFWordExtractor we = null;
Expand All @@ -77,7 +73,22 @@ public void extractFiles() {
}
}
}


/**
 * Extracts the text of every binary {@code .doc} file in {@code docFiles}
 * and appends it to {@code contents}.
 *
 * <p>A file that cannot be read is logged and skipped so that one bad
 * document does not abort the remaining files.
 */
public void extractDocFiles() {
    for (File file : docFiles) {
        // try-with-resources guarantees the file handle is released even
        // when POI fails while parsing the document.
        try (FileInputStream in = new FileInputStream(file)) {
            WordExtractor extractor = new WordExtractor(new HWPFDocument(in));
            try {
                this.contents.add(extractor.getText());
            } finally {
                // Close the extractor even if getText() throws.
                extractor.close();
            }
        } catch (IOException e) {
            // Keep the file name and the stack trace; the message alone
            // is often null or too terse to diagnose.
            logger.error("Failed to extract text from {}", file, e);
        }
    }
}

/**
 * Returns the list that the extract methods append their text to.
 *
 * @return the internal contents list itself (not a copy)
 */
public List<String> getContents() {
    return this.contents;
}
Expand Down
52 changes: 52 additions & 0 deletions src/main/java/com/cv/parser/extract/RTFExtractor.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
package com.cv.parser.extract;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import javax.swing.text.BadLocationException;
import javax.swing.text.Document;
import javax.swing.text.rtf.RTFEditorKit;

import com.cv.parser.FileExtension;
import com.cv.parser.FileExtension.Ext;
import com.cv.parser.FileFinderByExt;

/**
 * Extracts plain text from {@code .rtf} files.
 *
 * <p>The files are obtained from {@link FileFinderByExt#finder} using the
 * {@link Ext#RTF} extension, and each file is parsed with the JDK's
 * {@link RTFEditorKit}. The resulting text is collected in a list, one entry
 * per file that was read successfully.
 */
public class RTFExtractor implements IExtractor {

    private File[] files;
    private final List<String> contents = new ArrayList<>();

    private final FileExtension fileExtension = new FileExtension();
    private final FileFinderByExt fileFinderByExt = new FileFinderByExt();

    /** Looks up the RTF files and then extracts their text. */
    @Override
    public void main() {
        setFiles();
        extractFiles();
    }

    /** Stores the files reported by the finder for the {@code .rtf} extension. */
    @Override
    public void setFiles() {
        this.files = fileFinderByExt.finder(fileExtension.get(Ext.RTF));
    }

    /**
     * Reads every file in {@code files} as RTF and appends its plain text to
     * the contents list. A file that fails to parse is reported and skipped.
     */
    @Override
    public void extractFiles() {
        // Nothing to do when setFiles() has not run or the finder gave no array.
        if (files == null) {
            return;
        }
        for (File file : files) {
            // try-with-resources releases the file handle on every path.
            try (FileInputStream stream = new FileInputStream(file)) {
                RTFEditorKit kit = new RTFEditorKit();
                Document doc = kit.createDefaultDocument();
                kit.read(stream, doc, 0);
                this.contents.add(doc.getText(0, doc.getLength()));
            } catch (IOException | BadLocationException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Returns the text collected by {@link #extractFiles()}.
     *
     * @return the internal contents list, one entry per extracted file
     */
    public List<String> getContents() {
        return contents;
    }
}
10 changes: 2 additions & 8 deletions src/main/java/com/cv/parser/extract/TXTExtractor.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,8 @@ public class TXTExtractor implements IExtractor {
private FileExtension fe = new FileExtension();
private FileFinderByExt find = new FileFinderByExt();

File[] txtFiles;
List<String> contents = new ArrayList<String>();

File[] filesInPublicDir;

public TXTExtractor(File[] filesInPublicDir) {
this.filesInPublicDir = filesInPublicDir;
}
private File[] txtFiles;
private List<String> contents = new ArrayList<String>();

public void main() {
setFiles();
Expand Down

0 comments on commit 747199f

Please sign in to comment.