Skip to content

Commit

Permalink
Merge pull request #28 from ContentMine/dev
Browse files Browse the repository at this point in the history
Update to 0.2.26
  • Loading branch information
tarrow committed Apr 19, 2016
2 parents 01957ab + e475526 commit 520854f
Show file tree
Hide file tree
Showing 447 changed files with 1,232,767 additions and 683 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@
target/
docs/
src/test/resources/org/xmlcml/norma/pubstyle/getpapers/anopheles/
/target/
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
language: java
1 change: 1 addition & 0 deletions JATS-archivearticle1.dtd
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<!-- dummy -->
2 changes: 1 addition & 1 deletion docs/TUTORIAL.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ This means

This should create the 11 XML files in `target/plos10/`.

Then we convert them. Now the topr directory is `target/plos10/`. It's not a CM directory but it has many child CM directories and converts each. We use the same command as before:
Then we convert them. Now the top directory is `target/plos10/`. It is not itself a CM directory, but it contains many child CM directories, and norma converts each of them. We use the same command as before:
```
norma -q target/plos10/ -i fulltext.xml -o scholarly.html --xsl nlm2html
```
Expand Down
1 change: 1 addition & 0 deletions loose.dtd
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<!-- dummy DTD file -->
31 changes: 30 additions & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@
<name>norma</name>
<description>A Java library for processing multiple legacy formats into normalized HTML5</description>

<properties>
<opennlp.version>1.6.0</opennlp.version>
</properties>
<licenses>
<license>
<name>Apache License, Version 2.0</name>
Expand Down Expand Up @@ -53,6 +56,7 @@
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>cobertura-maven-plugin</artifactId>
<version>2.7</version>
<configuration>
<check>
<haltOnFailure>false</haltOnFailure>
Expand Down Expand Up @@ -92,6 +96,7 @@
<plugin>
<groupId>com.mycila.maven-license-plugin</groupId>
<artifactId>maven-license-plugin</artifactId>
<version>1.10.b1</version>
<configuration>
<header>src/main/resources/header.txt</header>
</configuration>
Expand Down Expand Up @@ -185,7 +190,25 @@
</dataSet>
</configuration>
</plugin>

<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.5</version>
<configuration>
<source>1.5</source>
<target>1.5</target>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.12.4</version>
<configuration>
<forkCount>3</forkCount>
<reuseForks>true</reuseForks>
<argLine>-Xmx1024m -XX:MaxPermSize=256m</argLine>
</configuration>
</plugin>
</plugins>
</build>

Expand Down Expand Up @@ -214,6 +237,12 @@
<artifactId>svg2xml</artifactId>
<version>0.1-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.apache.opennlp</groupId>
<artifactId>opennlp-tools</artifactId>
<version>${opennlp.version}</version>
</dependency>

<!-- https://github.com/jayway/JsonPath/blob/master/README.md -->
<dependency>
<groupId>com.jayway.jsonpath</groupId>
Expand Down
101 changes: 54 additions & 47 deletions src/main/java/org/xmlcml/norma/NormaArgProcessor.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
import org.xmlcml.cmine.args.StringPair;
import org.xmlcml.cmine.args.ValueElement;
import org.xmlcml.cmine.args.VersionManager;
import org.xmlcml.cmine.files.CMDir;
import org.xmlcml.cmine.files.CTree;
import org.xmlcml.html.HtmlElement;
import org.xmlcml.norma.image.ocr.NamedImage;
import org.xmlcml.norma.input.html.HtmlCleaner;
Expand All @@ -46,7 +46,7 @@ public class NormaArgProcessor extends DefaultArgProcessor {

public final static String HELP_NORMA = "Norma help";

private static String RESOURCE_NAME_TOP = "/org/xmlcml/norma";
public static String RESOURCE_NAME_TOP = "/org/xmlcml/norma";
private static String ARGS_RESOURCE = RESOURCE_NAME_TOP+"/"+"args.xml";
private static final VersionManager NORMA_VERSION_MANAGER = new VersionManager();

Expand Down Expand Up @@ -193,12 +193,16 @@ public void transform(ArgumentOption option) {
}

public void runTransform(ArgumentOption option) {
boolean ok = false;
if (currentCTree == null) {
LOG.warn("No current CMDir");
LOG.warn("No current CTree");
} else {
LOG.trace("***run transform "+currentCTree);
getOrCreateNormaTransformer();
normaTransformer.transform(option);
ok = normaTransformer.transform(option);
if (!ok) {
currentCTree = null;
}
}
}

Expand Down Expand Up @@ -266,93 +270,96 @@ void writeImages() {
// ==========================


public File checkAndGetInputFile(CMDir cmDir) {
if (cmDir == null) {
throw new RuntimeException("null cmDir");
public File checkAndGetInputFile(CTree cTree) {
if (cTree == null) {
throw new RuntimeException("null cTree");
}
String inputName = getString();
if (inputName == null) {
throw new RuntimeException("Must have single input option");
}
if (!CMDir.isReservedFilename(inputName) && !CMDir.hasReservedParentDirectory(inputName) ) {
if (!CTree.isReservedFilename(inputName) && !CTree.hasReservedParentDirectory(inputName) ) {
throw new RuntimeException("Input must be reserved file; found: "+inputName);
}
File inputFile = cmDir.getExistingReservedFile(inputName);
File inputFile = cTree.getExistingReservedFile(inputName);
if (inputFile == null) {
inputFile = cmDir.getExistingFileWithReservedParentDirectory(inputName);
inputFile = cTree.getExistingFileWithReservedParentDirectory(inputName);
}
if (inputFile == null) {
throw new RuntimeException("Could not find input file "+inputName+" in directory "+cmDir.getDirectory());
String msg = "Could not find input file "+inputName+" in directory "+cTree.getDirectory();
TREE_LOG().error(msg);
System.err.print("!");
// throw new RuntimeException(msg);
}
return inputFile;
}

private void createCMDirListFromInputList() {
private void createCTreeListFromInputList() {
// proceed unless there is a single reserved file for input
if (CMDir.isNonEmptyNonReservedInputList(inputList)) {
LOG.trace("CREATING CMDir FROM INPUT:"+inputList);
if (CTree.isNonEmptyNonReservedInputList(inputList)) {
LOG.trace("CREATING CTree FROM INPUT:"+inputList);
// this actually creates directory
getOrCreateOutputDirectory();
ensureCTreeList();
createNewCMDirsAndCopyOriginalFilesAndAddToList();
createNewCTreesAndCopyOriginalFilesAndAddToList();
}
}

private void createNewCMDirsAndCopyOriginalFilesAndAddToList() {
private void createNewCTreesAndCopyOriginalFilesAndAddToList() {
ensureCTreeList();
for (String filename : inputList) {
try {
CMDir cmDir = createCMDirAndCopyFileOrMakeSubDirectory(filename);
if (cmDir != null) {
cTreeList.add(cmDir);
CTree cTree = createCTreeAndCopyFileOrMakeSubDirectory(filename);
if (cTree != null) {
cTreeList.add(cTree);
}
} catch (IOException e) {
LOG.error("Failed to create CMDir: "+filename+"; "+e);
LOG.error("Failed to create CTree: "+filename+"; "+e);
}
}
}

private CMDir createCMDirAndCopyFileOrMakeSubDirectory(String filename) throws IOException {
CMDir cmDir = null;
private CTree createCTreeAndCopyFileOrMakeSubDirectory(String filename) throws IOException {
CTree cTree = null;
File file = new File(filename);
if (file.isDirectory()) {
LOG.error("should not have any directories in inputList: "+file);
this.PROJECT_LOG().error("should not have any directories in inputList: "+file);
} else {
if (output != null) {
String name = FilenameUtils.getName(filename);
if (CMDir.isReservedFilename(name)) {
LOG.error(name+" is reserved for CMDir: (check that inputs are not already in a CMDir) "+file.getAbsolutePath());
if (CTree.isReservedFilename(name)) {
this.PROJECT_LOG().info(name+" is reserved for CTree: (check that inputs are not already in a CTree) "+file.getAbsolutePath());
}
String cmFilename = CMDir.getCMDirReservedFilenameForExtension(name);
String cmFilename = CTree.getCTreeReservedFilenameForExtension(name);
if (cmFilename == null) {
LOG.error("Cannot create CMDir from this type of file: "+name);
this.PROJECT_LOG().error("Cannot create CTree from this type of file: "+name);
return null;
}
LOG.trace("Reserved filename: "+cmFilename);
if (CMDir.isReservedDirectory(cmFilename)) {
cmDir = makeCMDir(name);
ensureReservedDirectoryAndCopyFile(cmDir, cmFilename, filename);
if (CTree.isReservedDirectory(cmFilename)) {
cTree = makeCTree(name);
ensureReservedDirectoryAndCopyFile(cTree, cmFilename, filename);
} else {
cmDir = makeCMDir(name);
File destFile = cmDir.getReservedFile(cmFilename);
cTree = makeCTree(name);
File destFile = cTree.getReservedFile(cmFilename);
if (destFile != null) {
FileUtils.copyFile(file, destFile);
}
}
}
}
return cmDir;
return cTree;
}

private CMDir makeCMDir(String name) {
CMDir cmDir;
private CTree makeCTree(String name) {
CTree cTree;
String dirName = FilenameUtils.removeExtension(name);
cmDir = createCMDir(dirName);
return cmDir;
cTree = createCTree(dirName);
return cTree;
}

private void ensureReservedDirectoryAndCopyFile(CMDir cmDir, String reservedFilename, String filename) {
File reservedDir = new File(cmDir.getDirectory(), reservedFilename);
private void ensureReservedDirectoryAndCopyFile(CTree cTree, String reservedFilename, String filename) {
File reservedDir = new File(cTree.getDirectory(), reservedFilename);
LOG.trace("Res "+reservedDir.getAbsolutePath());
File orig = new File(filename);
LOG.trace("Orig: "+orig.getAbsolutePath());
Expand All @@ -372,19 +379,19 @@ private void ensureReservedDirectoryAndCopyFile(CMDir cmDir, String reservedFile

}

private CMDir createCMDir(String dirName) {
File cmDirFile = new File(output, dirName);
CMDir cmDir = new CMDir(cmDirFile);
cmDir.createDirectory(cmDirFile, false);
return cmDir;
private CTree createCTree(String dirName) {
File cTreeFile = new File(output, dirName);
CTree cTree = new CTree(cTreeFile);
cTree.createDirectory(cTreeFile, false);
return cTree;
}

private void getOrCreateOutputDirectory() {
if (output != null) {
File outputDir = new File(output);
if (outputDir.exists()) {
if (!outputDir.isDirectory()) {
throw new RuntimeException("cmDirRoot "+outputDir+" must be a directory");
throw new RuntimeException("cTreeRoot "+outputDir+" must be a directory");
}
} else {
outputDir.mkdirs();
Expand Down Expand Up @@ -439,10 +446,10 @@ public List<SectionTagger> getSectionTaggers() {
*/
public void parseArgs(String[] args) {
super.parseArgs(args);
createCMDirListFromInputList();
createCTreeListFromInputList();
}

public CMDir getCurrentCMDir() {
public CTree getCurrentCMTree() {
return currentCTree;
}

Expand Down
Loading

0 comments on commit 520854f

Please sign in to comment.