From cc5058bfcfc1e3448c69fb334a734b002a75f559 Mon Sep 17 00:00:00 2001 From: skoulouzis Date: Mon, 22 May 2017 16:32:07 +0200 Subject: [PATCH] Catch formatter exception --- .../prepare/controller/DataPrepare.java | 278 +++++++++--------- rest/nb-configuration.xml | 1 + 2 files changed, 143 insertions(+), 136 deletions(-) diff --git a/classification/src/main/java/eu/edisonproject/classification/prepare/controller/DataPrepare.java b/classification/src/main/java/eu/edisonproject/classification/prepare/controller/DataPrepare.java index d1ae25c..e8d1f6d 100644 --- a/classification/src/main/java/eu/edisonproject/classification/prepare/controller/DataPrepare.java +++ b/classification/src/main/java/eu/edisonproject/classification/prepare/controller/DataPrepare.java @@ -59,152 +59,152 @@ */ public class DataPrepare implements IDataPrepare { - private String inputFolder; - private String outputFolder; - private LinkedList documentObjectList; - private DocumentObject documentObject; - private String charArraySetPath; + private String inputFolder; + private String outputFolder; + private LinkedList documentObjectList; + private DocumentObject documentObject; + private String charArraySetPath; // private CharArraySet stopWordArraySet; // private ReaderFile fileReader; // private static final int maxNumberOfAvroPerFile = 10; - private final StopWord cleanStopWord; - private final StanfordLemmatizer cleanLemmatisation; - - public DataPrepare(String inputFolder, String outputFolder, String stopWordsPath) { - this.inputFolder = inputFolder; - this.outputFolder = outputFolder; - documentObjectList = new LinkedList<>(); - CharArraySet stopWordArraySet = new CharArraySet(ConfigHelper.loadStopWords(stopWordsPath), true); - cleanStopWord = new StopWord(stopWordArraySet); - cleanLemmatisation = new StanfordLemmatizer(); - } - - @Override - public void execute() { - File file = new File(inputFolder); - Document davro; - DocumentAvroSerializer dAvroSerializer = null; - if (file.isDirectory()) { - File[] filesInDir = file.listFiles(); + private final StopWord cleanStopWord; + private final StanfordLemmatizer cleanLemmatisation; + + public DataPrepare(String inputFolder, String outputFolder, String stopWordsPath) { + this.inputFolder = inputFolder; + this.outputFolder = outputFolder; + documentObjectList = new LinkedList<>(); + CharArraySet stopWordArraySet = new CharArraySet(ConfigHelper.loadStopWords(stopWordsPath), true); + cleanStopWord = new StopWord(stopWordArraySet); + cleanLemmatisation = new StanfordLemmatizer(); + } + + @Override + public void execute() { + File file = new File(inputFolder); + Document davro; + DocumentAvroSerializer dAvroSerializer = null; + if (file.isDirectory()) { + File[] filesInDir = file.listFiles(); // Arrays.sort(filesInDir); // LocalDate date = getCreationDate(file); - for (File f : filesInDir) { - if (f.isFile() && FilenameUtils.getExtension(f.getName()).endsWith("txt")) { - LocalDate date = getCreationDate(f); - documentObject = new DocumentObject(); - documentObject.setDate(date); - ReaderFile rf = new ReaderFile(f.getAbsolutePath()); - String contents = rf.readFile(); - cleanStopWord.setDescription(contents); - String cleanCont = cleanStopWord.execute().toLowerCase(); - cleanLemmatisation.setDescription(cleanCont); - cleanCont = cleanLemmatisation.execute(); - documentObject.setDescription(cleanCont); - documentObject.setDocumentId(FilenameUtils.removeExtension(f.getName())); - documentObject.setTitle(f.getParentFile().getName()); + for (File f : filesInDir) { + if (f.isFile() && FilenameUtils.getExtension(f.getName()).endsWith("txt")) { + LocalDate date = getCreationDate(f); + documentObject = new DocumentObject(); + documentObject.setDate(date); + ReaderFile rf = new ReaderFile(f.getAbsolutePath()); + String contents = rf.readFile(); + cleanStopWord.setDescription(contents); + String cleanCont = cleanStopWord.execute().toLowerCase(); + cleanLemmatisation.setDescription(cleanCont); + cleanCont = cleanLemmatisation.execute(); + documentObject.setDescription(cleanCont); + documentObject.setDocumentId(FilenameUtils.removeExtension(f.getName())); + documentObject.setTitle(f.getParentFile().getName()); // extract(this.getDocumentObject(), f.getPath()); // documentObject.setDescription(documentObject.getDescription().toLowerCase()); // clean(this.getDocumentObject().getDescription()); - if (documentObject.getDescription().equals("")) { - continue; - } - documentObjectList.add(this.getDocumentObject()); - - davro = new Document(); - davro.setDocumentId(documentObject.getDocumentId()); - davro.setTitle(documentObject.getTitle()); - davro.setDate(documentObject.getDate().toString()); - davro.setDescription(documentObject.getDescription()); - - if (dAvroSerializer == null) { - dAvroSerializer = new DocumentAvroSerializer(outputFolder - + File.separator + documentObject.getTitle().replaceAll(" ", "_") - + date + ".avro", davro.getSchema()); - } - Logger.getLogger(Text2Avro.class.getName()).log(Level.INFO, "Adding :{0} to: {1}{2}{3}{4}.avro", new Object[]{documentObject.getDocumentId(), outputFolder, File.separator, documentObject.getTitle().replaceAll(" ", "_"), date}); - dAvroSerializer.serialize(davro); + if (documentObject.getDescription().equals("")) { + continue; + } + documentObjectList.add(this.getDocumentObject()); + + davro = new Document(); + davro.setDocumentId(documentObject.getDocumentId()); + davro.setTitle(documentObject.getTitle()); + davro.setDate(documentObject.getDate().toString()); + davro.setDescription(documentObject.getDescription()); + + if (dAvroSerializer == null) { + dAvroSerializer = new DocumentAvroSerializer(outputFolder + + File.separator + documentObject.getTitle().replaceAll(" ", "_") + + date + ".avro", davro.getSchema()); + } + Logger.getLogger(Text2Avro.class.getName()).log(Level.INFO, "Adding :{0} to: {1}{2}{3}{4}.avro", new Object[]{documentObject.getDocumentId(), outputFolder, File.separator, documentObject.getTitle().replaceAll(" ", "_"), date}); + dAvroSerializer.serialize(davro); + } + + } + + if (dAvroSerializer != null) { + dAvroSerializer.close(); + dAvroSerializer = null; + } } - } - - if (dAvroSerializer != null) { - dAvroSerializer.close(); - dAvroSerializer = null; - } } - } - - @Override - public void extract(DocumentObject jp, String filePath) { - Extractor extractorTitle = new Title(); - extractorTitle.setJp(jp); - extractorTitle.setFilePath(filePath); - extractorTitle.readFromFile(); - extractorTitle.extract(); - - Extractor extractorDate = new Date(); - extractorDate.setJp(extractorTitle.getJp()); - extractorDate.extract(); - - Extractor extractorText = new Text(); - extractorText.setJp(extractorDate.getJp()); - extractorText.extract(); - } + @Override + public void extract(DocumentObject jp, String filePath) { + Extractor extractorTitle = new Title(); + extractorTitle.setJp(jp); + extractorTitle.setFilePath(filePath); + extractorTitle.readFromFile(); + extractorTitle.extract(); + + Extractor extractorDate = new Date(); + extractorDate.setJp(extractorTitle.getJp()); + extractorDate.extract(); + + Extractor extractorText = new Text(); + extractorText.setJp(extractorDate.getJp()); + extractorText.extract(); + } - @Override - public void clean(String description) { - //System.out.println("DESCRIZIONE"+description); + @Override + public void clean(String description) { + //System.out.println("DESCRIZIONE"+description); // Cleaner cleanStopWord = new StopWord(this.getStopWordArraySet()); - cleanStopWord.setDescription(description); - documentObject.setDescription(cleanStopWord.execute()); - //System.out.println(documentObject.getDescription()); - Cleaner cleanStanfordLemmatizer = new StanfordLemmatizer(); - cleanStanfordLemmatizer.setDescription(documentObject.getDescription()); - documentObject.setDescription(cleanStanfordLemmatizer.execute()); + cleanStopWord.setDescription(description); + documentObject.setDescription(cleanStopWord.execute()); + //System.out.println(documentObject.getDescription()); + Cleaner cleanStanfordLemmatizer = new StanfordLemmatizer(); + cleanStanfordLemmatizer.setDescription(documentObject.getDescription()); + documentObject.setDescription(cleanStanfordLemmatizer.execute()); - } + } - public String getInputFolder() { - return inputFolder; - } + public String getInputFolder() { + return inputFolder; + } - public void setInputFolder(String inputFolder) { - this.inputFolder = inputFolder; - } + public void setInputFolder(String inputFolder) { + this.inputFolder = inputFolder; + } - public String getOutputFolder() { - return outputFolder; - } + public String getOutputFolder() { + return outputFolder; + } - public void setOutputFolder(String outputFolder) { - this.outputFolder = outputFolder; - } + public void setOutputFolder(String outputFolder) { + this.outputFolder = outputFolder; + } - public LinkedList getJobPostList() { - return documentObjectList; - } + public LinkedList getJobPostList() { + return documentObjectList; + } - public void setDocumentObjectList(LinkedList jdocumentObjectList) { - this.documentObjectList = jdocumentObjectList; - } + public void setDocumentObjectList(LinkedList jdocumentObjectList) { + this.documentObjectList = jdocumentObjectList; + } - public DocumentObject getDocumentObject() { - return documentObject; - } + public DocumentObject getDocumentObject() { + return documentObject; + } - public void setJDocumentObject(DocumentObject jobPost) { - this.documentObject = jobPost; - } + public void setJDocumentObject(DocumentObject jobPost) { + this.documentObject = jobPost; + } - public String getCharArraySetPath() { - return charArraySetPath; - } + public String getCharArraySetPath() { + return charArraySetPath; + } - public void setCharArraySetPath(String charArraySetPath) { - this.charArraySetPath = charArraySetPath; - } + public void setCharArraySetPath(String charArraySetPath) { + this.charArraySetPath = charArraySetPath; + } // public CharArraySet getStopWordArraySet() { // return stopWordArraySet; @@ -213,23 +213,29 @@ public void setCharArraySetPath(String charArraySetPath) { // public void setStopWordArraySet(CharArraySet charArraySet) { // this.stopWordArraySet = charArraySet; // } - private LocalDate getCreationDate(File file) { - Path p = Paths.get(file.getAbsolutePath()); - BasicFileAttributes attr = null; - try { - attr = Files.readAttributes(p, BasicFileAttributes.class); - } catch (IOException ex) { - Logger.getLogger(Text2Avro.class.getName()).log(Level.SEVERE, null, ex); - } - FileTime ct = attr.creationTime(); - DateTimeFormatter formatter; + private LocalDate getCreationDate(File file) { + Path p = Paths.get(file.getAbsolutePath()); + BasicFileAttributes attr = null; + try { + attr = Files.readAttributes(p, BasicFileAttributes.class); + } catch (IOException ex) { + Logger.getLogger(Text2Avro.class.getName()).log(Level.SEVERE, null, ex); + } + FileTime ct = attr.creationTime(); + DateTimeFormatter formatter; -// formatter = DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss.SSSSSS'Z'"); +// // LocalDate.parse("2016-09-18T11:40:03.750522Z", formatter); - formatter = DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss'Z'"); - LocalDate date = LocalDate.parse(ct.toString(), formatter); - Logger.getLogger(Text2Avro.class.getName()).log(Level.INFO, "CreationDate: {0}", date); - return date; - } + formatter = DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss'Z'"); + LocalDate date = null; + try { + date = LocalDate.parse(ct.toString(), formatter); + } catch (java.lang.IllegalArgumentException ex) { + formatter = DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss.SSSSSS'Z'"); + date = LocalDate.parse(ct.toString(), formatter); + } + Logger.getLogger(Text2Avro.class.getName()).log(Level.INFO, "CreationDate: {0}", date); + return date; + } } diff --git a/rest/nb-configuration.xml b/rest/nb-configuration.xml index c2decf1..f46fc52 100644 --- a/rest/nb-configuration.xml +++ b/rest/nb-configuration.xml @@ -14,5 +14,6 @@ That way multiple projects can share the same settings (useful for formatting ru Any value defined here will override the pom.xml file value but is only applicable to the current project. --> JDK_1.7 + false