Skip to content
This repository has been archived by the owner on Nov 21, 2023. It is now read-only.

Commit

Permalink
Catch formatter exception
Browse files Browse the repository at this point in the history
  • Loading branch information
skoulouzis committed May 22, 2017
1 parent 7afa64c commit cc5058b
Show file tree
Hide file tree
Showing 2 changed files with 143 additions and 136 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -59,152 +59,152 @@
*/
public class DataPrepare implements IDataPrepare {

private String inputFolder;
private String outputFolder;
private LinkedList<DocumentObject> documentObjectList;
private DocumentObject documentObject;
private String charArraySetPath;
private String inputFolder;
private String outputFolder;
private LinkedList<DocumentObject> documentObjectList;
private DocumentObject documentObject;
private String charArraySetPath;
// private CharArraySet stopWordArraySet;
// private ReaderFile fileReader;
// private static final int maxNumberOfAvroPerFile = 10;
private final StopWord cleanStopWord;
private final StanfordLemmatizer cleanLemmatisation;

public DataPrepare(String inputFolder, String outputFolder, String stopWordsPath) {
this.inputFolder = inputFolder;
this.outputFolder = outputFolder;
documentObjectList = new LinkedList<>();
CharArraySet stopWordArraySet = new CharArraySet(ConfigHelper.loadStopWords(stopWordsPath), true);
cleanStopWord = new StopWord(stopWordArraySet);
cleanLemmatisation = new StanfordLemmatizer();
}

@Override
public void execute() {
File file = new File(inputFolder);
Document davro;
DocumentAvroSerializer dAvroSerializer = null;
if (file.isDirectory()) {
File[] filesInDir = file.listFiles();
private final StopWord cleanStopWord;
private final StanfordLemmatizer cleanLemmatisation;

public DataPrepare(String inputFolder, String outputFolder, String stopWordsPath) {
this.inputFolder = inputFolder;
this.outputFolder = outputFolder;
documentObjectList = new LinkedList<>();
CharArraySet stopWordArraySet = new CharArraySet(ConfigHelper.loadStopWords(stopWordsPath), true);
cleanStopWord = new StopWord(stopWordArraySet);
cleanLemmatisation = new StanfordLemmatizer();
}

@Override
public void execute() {
File file = new File(inputFolder);
Document davro;
DocumentAvroSerializer dAvroSerializer = null;
if (file.isDirectory()) {
File[] filesInDir = file.listFiles();
// Arrays.sort(filesInDir);

// LocalDate date = getCreationDate(file);
for (File f : filesInDir) {
if (f.isFile() && FilenameUtils.getExtension(f.getName()).endsWith("txt")) {
LocalDate date = getCreationDate(f);
documentObject = new DocumentObject();
documentObject.setDate(date);
ReaderFile rf = new ReaderFile(f.getAbsolutePath());
String contents = rf.readFile();
cleanStopWord.setDescription(contents);
String cleanCont = cleanStopWord.execute().toLowerCase();
cleanLemmatisation.setDescription(cleanCont);
cleanCont = cleanLemmatisation.execute();
documentObject.setDescription(cleanCont);
documentObject.setDocumentId(FilenameUtils.removeExtension(f.getName()));
documentObject.setTitle(f.getParentFile().getName());
for (File f : filesInDir) {
if (f.isFile() && FilenameUtils.getExtension(f.getName()).endsWith("txt")) {
LocalDate date = getCreationDate(f);
documentObject = new DocumentObject();
documentObject.setDate(date);
ReaderFile rf = new ReaderFile(f.getAbsolutePath());
String contents = rf.readFile();
cleanStopWord.setDescription(contents);
String cleanCont = cleanStopWord.execute().toLowerCase();
cleanLemmatisation.setDescription(cleanCont);
cleanCont = cleanLemmatisation.execute();
documentObject.setDescription(cleanCont);
documentObject.setDocumentId(FilenameUtils.removeExtension(f.getName()));
documentObject.setTitle(f.getParentFile().getName());
// extract(this.getDocumentObject(), f.getPath());
// documentObject.setDescription(documentObject.getDescription().toLowerCase());
// clean(this.getDocumentObject().getDescription());
if (documentObject.getDescription().equals("")) {
continue;
}
documentObjectList.add(this.getDocumentObject());

davro = new Document();
davro.setDocumentId(documentObject.getDocumentId());
davro.setTitle(documentObject.getTitle());
davro.setDate(documentObject.getDate().toString());
davro.setDescription(documentObject.getDescription());

if (dAvroSerializer == null) {
dAvroSerializer = new DocumentAvroSerializer(outputFolder
+ File.separator + documentObject.getTitle().replaceAll(" ", "_")
+ date + ".avro", davro.getSchema());
}
Logger.getLogger(Text2Avro.class.getName()).log(Level.INFO, "Adding :{0} to: {1}{2}{3}{4}.avro", new Object[]{documentObject.getDocumentId(), outputFolder, File.separator, documentObject.getTitle().replaceAll(" ", "_"), date});
dAvroSerializer.serialize(davro);
if (documentObject.getDescription().equals("")) {
continue;
}
documentObjectList.add(this.getDocumentObject());

davro = new Document();
davro.setDocumentId(documentObject.getDocumentId());
davro.setTitle(documentObject.getTitle());
davro.setDate(documentObject.getDate().toString());
davro.setDescription(documentObject.getDescription());

if (dAvroSerializer == null) {
dAvroSerializer = new DocumentAvroSerializer(outputFolder
+ File.separator + documentObject.getTitle().replaceAll(" ", "_")
+ date + ".avro", davro.getSchema());
}
Logger.getLogger(Text2Avro.class.getName()).log(Level.INFO, "Adding :{0} to: {1}{2}{3}{4}.avro", new Object[]{documentObject.getDocumentId(), outputFolder, File.separator, documentObject.getTitle().replaceAll(" ", "_"), date});
dAvroSerializer.serialize(davro);
}

}

if (dAvroSerializer != null) {
dAvroSerializer.close();
dAvroSerializer = null;
}
}

}

if (dAvroSerializer != null) {
dAvroSerializer.close();
dAvroSerializer = null;
}
}

}

@Override
public void extract(DocumentObject jp, String filePath) {
Extractor extractorTitle = new Title();
extractorTitle.setJp(jp);
extractorTitle.setFilePath(filePath);
extractorTitle.readFromFile();
extractorTitle.extract();

Extractor extractorDate = new Date();
extractorDate.setJp(extractorTitle.getJp());
extractorDate.extract();

Extractor extractorText = new Text();
extractorText.setJp(extractorDate.getJp());
extractorText.extract();
}
@Override
public void extract(DocumentObject jp, String filePath) {
Extractor extractorTitle = new Title();
extractorTitle.setJp(jp);
extractorTitle.setFilePath(filePath);
extractorTitle.readFromFile();
extractorTitle.extract();

Extractor extractorDate = new Date();
extractorDate.setJp(extractorTitle.getJp());
extractorDate.extract();

Extractor extractorText = new Text();
extractorText.setJp(extractorDate.getJp());
extractorText.extract();
}

@Override
public void clean(String description) {
//System.out.println("DESCRIZIONE"+description);
@Override
public void clean(String description) {
//System.out.println("DESCRIZIONE"+description);
// Cleaner cleanStopWord = new StopWord(this.getStopWordArraySet());
cleanStopWord.setDescription(description);
documentObject.setDescription(cleanStopWord.execute());
//System.out.println(documentObject.getDescription());
Cleaner cleanStanfordLemmatizer = new StanfordLemmatizer();
cleanStanfordLemmatizer.setDescription(documentObject.getDescription());
documentObject.setDescription(cleanStanfordLemmatizer.execute());
cleanStopWord.setDescription(description);
documentObject.setDescription(cleanStopWord.execute());
//System.out.println(documentObject.getDescription());
Cleaner cleanStanfordLemmatizer = new StanfordLemmatizer();
cleanStanfordLemmatizer.setDescription(documentObject.getDescription());
documentObject.setDescription(cleanStanfordLemmatizer.execute());

}
}

public String getInputFolder() {
return inputFolder;
}
public String getInputFolder() {
return inputFolder;
}

public void setInputFolder(String inputFolder) {
this.inputFolder = inputFolder;
}
public void setInputFolder(String inputFolder) {
this.inputFolder = inputFolder;
}

public String getOutputFolder() {
return outputFolder;
}
public String getOutputFolder() {
return outputFolder;
}

public void setOutputFolder(String outputFolder) {
this.outputFolder = outputFolder;
}
public void setOutputFolder(String outputFolder) {
this.outputFolder = outputFolder;
}

public LinkedList<DocumentObject> getJobPostList() {
return documentObjectList;
}
public LinkedList<DocumentObject> getJobPostList() {
return documentObjectList;
}

public void setDocumentObjectList(LinkedList<DocumentObject> jdocumentObjectList) {
this.documentObjectList = jdocumentObjectList;
}
public void setDocumentObjectList(LinkedList<DocumentObject> jdocumentObjectList) {
this.documentObjectList = jdocumentObjectList;
}

public DocumentObject getDocumentObject() {
return documentObject;
}
public DocumentObject getDocumentObject() {
return documentObject;
}

public void setJDocumentObject(DocumentObject jobPost) {
this.documentObject = jobPost;
}
public void setJDocumentObject(DocumentObject jobPost) {
this.documentObject = jobPost;
}

public String getCharArraySetPath() {
return charArraySetPath;
}
public String getCharArraySetPath() {
return charArraySetPath;
}

public void setCharArraySetPath(String charArraySetPath) {
this.charArraySetPath = charArraySetPath;
}
public void setCharArraySetPath(String charArraySetPath) {
this.charArraySetPath = charArraySetPath;
}

// public CharArraySet getStopWordArraySet() {
// return stopWordArraySet;
Expand All @@ -213,23 +213,29 @@ public void setCharArraySetPath(String charArraySetPath) {
// public void setStopWordArraySet(CharArraySet charArraySet) {
// this.stopWordArraySet = charArraySet;
// }
private LocalDate getCreationDate(File file) {
Path p = Paths.get(file.getAbsolutePath());
BasicFileAttributes attr = null;
try {
attr = Files.readAttributes(p, BasicFileAttributes.class);
} catch (IOException ex) {
Logger.getLogger(Text2Avro.class.getName()).log(Level.SEVERE, null, ex);
}
FileTime ct = attr.creationTime();
DateTimeFormatter formatter;
private LocalDate getCreationDate(File file) {
Path p = Paths.get(file.getAbsolutePath());
BasicFileAttributes attr = null;
try {
attr = Files.readAttributes(p, BasicFileAttributes.class);
} catch (IOException ex) {
Logger.getLogger(Text2Avro.class.getName()).log(Level.SEVERE, null, ex);
}
FileTime ct = attr.creationTime();
DateTimeFormatter formatter;

// formatter = DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss.SSSSSS'Z'");
//
// LocalDate.parse("2016-09-18T11:40:03.750522Z", formatter);
formatter = DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss'Z'");
LocalDate date = LocalDate.parse(ct.toString(), formatter);
Logger.getLogger(Text2Avro.class.getName()).log(Level.INFO, "CreationDate: {0}", date);
return date;
}
formatter = DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss'Z'");
LocalDate date = null;
try {
date = LocalDate.parse(ct.toString(), formatter);
} catch (java.lang.IllegalArgumentException ex) {
formatter = DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss.SSSSSS'Z'");
date = LocalDate.parse(ct.toString(), formatter);
}
Logger.getLogger(Text2Avro.class.getName()).log(Level.INFO, "CreationDate: {0}", date);
return date;
}

}
1 change: 1 addition & 0 deletions rest/nb-configuration.xml
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,6 @@ That way multiple projects can share the same settings (useful for formatting ru
Any value defined here will override the pom.xml file value but is only applicable to the current project.
-->
<netbeans.hint.jdkPlatform>JDK_1.7</netbeans.hint.jdkPlatform>
<org-netbeans-modules-whitelist.whitelist-oracle>false</org-netbeans-modules-whitelist.whitelist-oracle>
</properties>
</project-shared-configuration>

0 comments on commit cc5058b

Please sign in to comment.