Skip to content

Commit

Permalink
Added the ability to dump all the documentation as a dataset
Browse files Browse the repository at this point in the history
- this includes patterns, languages and data formats
  • Loading branch information
orpiske committed Aug 2, 2024
1 parent a715d84 commit 8c8d530
Show file tree
Hide file tree
Showing 5 changed files with 83 additions and 39 deletions.
24 changes: 24 additions & 0 deletions scripts/convert-file.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/bin/bash

# Converts a single AsciiDoc file to Markdown in two steps:
#   1. AsciiDoc -> DocBook XML (asciidoc)
#   2. DocBook XML -> Markdown (pandoc)
# Both generated files are written next to the input file.
#
# Usage: convert-file.sh <input.adoc> <log-dir>
#   $1  path of the AsciiDoc file to convert
#   $2  directory that receives the per-file log (<basename of $1>.log)
#
# Exit status: 0 on success, 1 when a conversion step fails, 2 on bad usage.
#
# Based on https://tinyapps.org/blog/201701240700_convert_asciidoc_to_markdown.html

if [ $# -lt 2 ] ; then
  printf "Usage: %s <input.adoc> <log-dir>\n" "$0" >&2
  exit 2
fi

inputFile=$1
simpleName=$(basename -- "${inputFile}")

# Strip only a trailing .adoc (any letter case, since callers may select files
# case-insensitively). A substring replacement could hit ".adoc" in a directory
# name, or match nothing at all and leave the Markdown target equal to the input
# file, which pandoc would then overwrite.
baseName=${inputFile%.[aA][dD][oO][cC]}
xmlFile=${baseName}.xml
markDownFile=${baseName}.md

logDir=$2
log=$logDir/$simpleName.log

printf "AsciiDoc to XML conversion for %s\n" "${simpleName}"
# The output file is passed explicitly so both steps agree on the XML file name.
if ! asciidoc -b docbook -o "${xmlFile}" "${inputFile}" > "$log" 2>&1 ; then
  printf "Failed AsciiDoc to XML conversion for %s\n" "${inputFile}" | tee -a "$log"
  # Without a fresh XML file the pandoc step can only fail or convert stale data.
  exit 1
fi

printf "XML to Markdown conversion for %s\n" "${simpleName}"
if ! pandoc -f docbook -t markdown_strict "${xmlFile}" -o "${markDownFile}" >> "$log" 2>&1 ; then
  printf "Failed XML to Markdown conversion for %s\n" "${inputFile}" | tee -a "$log"
  exit 1
fi

14 changes: 9 additions & 5 deletions scripts/prepare-docs-for-dataset.sh
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
#!/bin/bash

# Root of the source tree to scan: first argument, or a default checkout under $HOME.
# (The closing quote must come before the brace, otherwise the default ends in a literal "}".)
CODE_DIR=${1:-"$HOME/code/java/camel"}
# Directory this script was started from; convert-file.sh is looked up there.
install_path=$(dirname "$0")

# Based on https://tinyapps.org/blog/201701240700_convert_asciidoc_to_markdown.html
# Fresh temporary directory that collects the conversion logs.
logDir=$(mktemp -d)

# Step 1: convert all component documentation to a docbook file
# (CODE_DIR is quoted so a path containing spaces is passed to find as one argument)
find "$CODE_DIR" -type f -iname '*-component.adoc' -ipath '*src/main/docs*' -exec asciidoc -b docbook {} \;

# Step 2: convert the docbook files to Markdown (each result is named <file>.xml.md)
find "$CODE_DIR" -type f -iname '*-component.xml' -ipath '*src/main/docs*' -exec pandoc -f docbook -t markdown_strict {} -o {}.md \;
# Run convert-file.sh once per AsciiDoc file found under a src/main/docs path.
# Its console output is shown and also saved to conversion.log in the log directory.
find "$CODE_DIR" -type f -iname '*.adoc' -ipath '*src/main/docs*' -exec "${install_path}"/convert-file.sh {} "$logDir" \; | tee "$logDir/conversion.log"

# Compare the number of source documents with the number of XML files present afterwards.
numDocs=$(find "$CODE_DIR" -type f -iname '*.adoc' -ipath '*src/main/docs*' | wc -l)
numConverted=$(find "$CODE_DIR" -type f -iname '*.xml' -ipath '*src/main/docs*' | wc -l)
printf "Conversion complete. Checking results: converted %s of %s\n" "$numConverted" "$numDocs"
# Trailing newline added so the shell prompt does not end up glued to this message.
printf "Check %s for results and failures\n" "$logDir/conversion.log"
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,12 @@
import org.apache.camel.catalog.CamelCatalog;
import org.apache.camel.catalog.DefaultCamelCatalog;
import org.apache.camel.jbang.ai.data.documentation.ComponentDocumentationVisitor;
import org.apache.camel.jbang.ai.data.documentation.GenericDocumentationVisitor;
import org.apache.camel.jbang.ai.util.CatalogUtil;
import org.apache.camel.jbang.ai.util.parsers.DocumentationParser;
import org.apache.camel.jbang.ai.util.parsers.MarkdownParser;
import org.apache.commons.io.FileUtils;
import org.commonmark.node.AbstractVisitor;

public class DocumentationProcessor {
private final String sourcePath;
Expand All @@ -27,25 +29,21 @@ public DocumentationProcessor(String sourcePath) {

/**
 * Tells whether a path points to a documentation file that should be processed.
 * <p>
 * We only want the documentation from the source code (not the ones copied during build),
 * so the file must sit under a {@code src/main/docs} path. Despite the method name, the
 * file name is matched against the Markdown extension ({@code .md}), not {@code .adoc}.
 *
 * @param  p the path to check
 * @return   {@code true} if the file name ends with {@code .md} and the absolute path
 *           contains {@code src/main/docs}; {@code false} otherwise
 */
public boolean isAdoc(Path p) {
    final String fileName = p.toFile().getName();
    final String absolutePath = p.toAbsolutePath().toString();

    return fileName.endsWith(".md") && absolutePath.contains("src/main/docs");
}

/**
 * Tells whether a path refers to a component document.
 *
 * @param  p the path to check
 * @return   {@code true} if the file name contains {@code -component}
 */
public boolean isComponent(Path p) {
    final String fileName = p.toFile().getName();

    return fileName.contains("-component");
}

public boolean parse(Path doc, String componentName) {
public boolean parse(Path doc, String componentName, AbstractVisitor visitor) {
try {
final String content = FileUtils.readFileToString(doc.toFile());

DocumentationParser parser = new MarkdownParser();
final String parsed = parser.parse(content, new ComponentDocumentationVisitor(catalog, componentName));
final String parsed = parser.parse(content, visitor);

if (parsed != null) {

Expand Down Expand Up @@ -76,12 +74,15 @@ public void process(int startFrom) throws InterruptedException {
int skippedCount = 0;
for (int i = startFrom; i < documentTotal; i++) {
Path doc = docs.get(i);
final String componentName = doc.getFileName().toString().replaceAll("-component.*", "");
final String fileName = doc.getFileName().toString();

final String componentName = fileName.replaceAll("-component.*", "");

System.out.printf("[%s] Processing document %d of %d %s for %s: ", CatalogUtil.currentTime(),
i + 1, documentTotal, doc.getFileName(), componentName);

if (parse(doc, componentName)) {
AbstractVisitor visitor = getVisitorForDocumentType(fileName, componentName);
if (parse(doc, componentName, visitor)) {
generatedCount++;
System.out.printf("done%n");
} else {
Expand All @@ -91,4 +92,12 @@ public void process(int startFrom) throws InterruptedException {
}
System.out.printf("[%s] Generated: %d. Skipped: %d%n", CatalogUtil.currentTime(), generatedCount, skippedCount);
}

/**
 * Picks the visitor used to process a document, based on its file name.
 *
 * @param  fileName      the name of the document file
 * @param  componentName the name handed to the visitor that is created
 * @return               a {@link ComponentDocumentationVisitor} when the file name contains
 *                       {@code -component}, otherwise a {@link GenericDocumentationVisitor}
 */
private AbstractVisitor getVisitorForDocumentType(String fileName, String componentName) {
    final boolean componentDocument = fileName.contains("-component");

    if (!componentDocument) {
        return new GenericDocumentationVisitor(catalog, componentName);
    }

    return new ComponentDocumentationVisitor(catalog, componentName);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,9 @@
import org.commonmark.ext.gfm.tables.TableHead;
import org.commonmark.ext.gfm.tables.TableRow;
import org.commonmark.node.AbstractVisitor;
import org.commonmark.node.Code;
import org.commonmark.node.CustomNode;
import org.commonmark.node.Document;
import org.commonmark.node.Emphasis;
import org.commonmark.node.HardLineBreak;
import org.commonmark.node.Heading;
import org.commonmark.node.SoftLineBreak;
import org.commonmark.node.Text;

public class ComponentDocumentationVisitor extends AbstractVisitor {
Expand All @@ -30,26 +26,6 @@ public ComponentDocumentationVisitor(CamelCatalog catalog, String componentName)
this.componentName = componentName;
}

// Pass-through override: hands the emphasis node to the superclass implementation
// and adds no behavior of its own.
@Override
public void visit(Emphasis emphasis) {
super.visit(emphasis);
}

// Pass-through override: hands the custom node to the superclass implementation
// and adds no behavior of its own.
@Override
public void visit(CustomNode customNode) {
super.visit(customNode);
}

// Pass-through override: hands the code node to the superclass implementation
// and adds no behavior of its own.
@Override
public void visit(Code code) {
super.visit(code);
}

// Pass-through override: hands the text node to the superclass implementation
// and adds no behavior of its own.
@Override
public void visit(Text text) {
super.visit(text);
}

@Override
public void visit(Document document) {
super.visit(document);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
package org.apache.camel.jbang.ai.data.documentation;

import org.apache.camel.catalog.CamelCatalog;
import org.apache.camel.util.StringHelper;
import org.commonmark.node.AbstractVisitor;
import org.commonmark.node.Document;
import org.commonmark.node.Heading;
import org.commonmark.node.Text;

/**
 * Markdown document visitor that adds a title to the document it visits.
 * <p>
 * After the regular traversal of the document, a level-1 heading holding the capitalized
 * name given at construction time is inserted as the document's first child.
 */
public class GenericDocumentationVisitor extends AbstractVisitor {
    // Kept from the constructor; not read anywhere in this class.
    private final CamelCatalog catalog;
    // Name used as the text of the generated title.
    private final String componentName;

    /**
     * Creates the visitor.
     *
     * @param catalog       the catalog instance (stored only)
     * @param componentName the name used, capitalized, as the title of visited documents
     */
    public GenericDocumentationVisitor(CamelCatalog catalog, String componentName) {
        this.catalog = catalog;
        this.componentName = componentName;
    }

    /**
     * Runs the superclass traversal first and then puts the title heading in front of the
     * document's existing children.
     *
     * @param document the document being visited; it is modified in place
     */
    @Override
    public void visit(Document document) {
        super.visit(document);

        document.prependChild(newTitleHeading());
    }

    // Builds the level-1 heading whose only child is the capitalized name.
    private Heading newTitleHeading() {
        final Heading heading = new Heading();

        heading.setLevel(1);
        heading.appendChild(new Text(StringHelper.capitalize(componentName)));

        return heading;
    }
}

0 comments on commit 8c8d530

Please sign in to comment.