Skip to content

Commit

Permalink
add parquet file output to the data command
Browse files Browse the repository at this point in the history
  • Loading branch information
rinaldodev authored and orpiske committed Jul 30, 2024
1 parent faf59db commit 2c3303a
Show file tree
Hide file tree
Showing 6 changed files with 101 additions and 4 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,8 @@ java -jar target/camel-jbang-plugin-explain-4.7.0-SNAPSHOT-jar-with-dependencies
You can generate LLM training datasets from the catalog information.
JSON and Parquet files are generated in the `dataset` directory.
Generate training data using the component information:
```shell
java -jar target/camel-jbang-plugin-explain-4.7.0-SNAPSHOT-jar-with-dependencies.jar data --model-name mistral:latest --data-type components
Expand Down
26 changes: 26 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@
<main.class>org.apache.camel.standalone.Main</main.class>
<velocity-version>2.3</velocity-version>
<commonmark-version>0.22.0</commonmark-version>

<avro-version>1.11.3</avro-version>
<parquet-avro-version>1.14.1</parquet-avro-version>
<hadoop-version>3.4.0</hadoop-version>
</properties>

<dependencyManagement>
Expand Down Expand Up @@ -77,6 +81,28 @@
<version>${commonmark-version}</version>
</dependency>

<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-avro</artifactId>
<version>${parquet-avro-version}</version>
</dependency>
<dependency>
<groupId>org.apache.avro</groupId>
<artifactId>avro</artifactId>
<version>${avro-version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${hadoop-version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-core</artifactId>
<version>${hadoop-version}</version>
</dependency>


</dependencies>

<profiles>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ protected void processRecords(int startFrom, List<String> componentNames, int to
protected void processOption(
List<AlpacaRecord> alpacaRecords, String componentName,
List<? extends BaseOptionModel> optionModels, String type) {
int componentOptionCount = 0;
int componentOptionCount = 1;
final int componentOptionTotal = optionModels.size();
for (BaseOptionModel optionModel : optionModels) {
StopWatch watch = new StopWatch();
Expand Down
15 changes: 12 additions & 3 deletions src/main/java/org/apache/camel/jbang/ai/util/CatalogUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.time.Instant;
import java.time.ZoneId;
import java.time.format.DateTimeFormatter;
Expand All @@ -13,22 +14,30 @@
import org.apache.camel.jbang.ai.types.AlpacaRecord;
import org.apache.camel.tooling.model.BaseOptionModel;

public class CatalogUtil {
public final class CatalogUtil {
private static final String DATASET_DIR = "dataset";
private static final String PATTERN_FORMAT = "HH:mm:ss";

// Utility class: not meant to be instantiated, even via reflection.
private CatalogUtil() {
    throw new IllegalStateException("Utility class should not be instantiated.");
}

public static void saveRecords(List<AlpacaRecord> alpacaRecords, String componentName) {
if (!alpacaRecords.isEmpty()) {
ObjectMapper mapper = new ObjectMapper();
mapper.enable(SerializationFeature.INDENT_OUTPUT);

final File file = new File("dataset", String.format("camel-%s.json", componentName));
final File file = new File(DATASET_DIR, String.format("camel-%s.json", componentName));
file.getParentFile().mkdirs();

try {
mapper.writeValue(file, alpacaRecords);
} catch (IOException e) {
throw new RuntimeException(e);
throw new RuntimeException("Failed to write json file", e);
}

Path parquetOutput = Path.of(DATASET_DIR, String.format("camel-%s.parquet", componentName));
ParquetUtil.saveParquet(alpacaRecords, parquetOutput);
}
}

Expand Down
50 changes: 50 additions & 0 deletions src/main/java/org/apache/camel/jbang/ai/util/ParquetUtil.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
package org.apache.camel.jbang.ai.util;

import java.io.IOException;
import java.nio.file.Path;
import java.util.List;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.camel.jbang.ai.types.AlpacaRecord;
import org.apache.parquet.avro.AvroParquetWriter;
import org.apache.parquet.io.LocalOutputFile;

/**
 * Utility for writing LLM training records ({@link AlpacaRecord}) to Parquet
 * files using the Avro schema bundled on the classpath.
 */
public final class ParquetUtil {

    /** Classpath location of the Avro schema describing one training record. */
    private static final String SCHEMA_FILE = "/training-set-schema.avsc";

    private ParquetUtil() {
        throw new IllegalStateException("Utility class should not be instantiated.");
    }

    /**
     * Writes the given records to a Parquet file at {@code outputPath}, one row
     * per record with the fields {@code instruction}, {@code input} and
     * {@code output}.
     *
     * @param alpacaRecords records to persist; an empty list yields a valid, empty file
     * @param outputPath destination path of the Parquet file
     * @throws RuntimeException if the schema cannot be read or the file cannot be written
     * @throws IllegalStateException if the schema resource is missing from the classpath
     */
    public static void saveParquet(List<AlpacaRecord> alpacaRecords, Path outputPath) {
        final Schema schema = loadSchema();

        var outputFile = new LocalOutputFile(outputPath);
        var builder = AvroParquetWriter
                .<GenericRecord>builder(outputFile)
                .withSchema(schema);

        try (var writer = builder.build()) {
            for (AlpacaRecord alpacaRecord : alpacaRecords) {
                GenericRecord record = new GenericData.Record(schema);
                record.put("input", alpacaRecord.getInput());
                record.put("instruction", alpacaRecord.getInstruction());
                record.put("output", alpacaRecord.getOutput());
                writer.write(record);
            }
        } catch (IOException e) {
            throw new RuntimeException("Failed to write Parquet file", e);
        }
    }

    /** Loads the bundled Avro schema, failing fast with a clear message if it is absent. */
    private static Schema loadSchema() {
        try (var schemaContent = ParquetUtil.class.getResourceAsStream(SCHEMA_FILE)) {
            // getResourceAsStream returns null (not an exception) when the
            // resource is missing; without this check parse(null) throws a bare NPE.
            if (schemaContent == null) {
                throw new IllegalStateException("Schema resource not found on classpath: " + SCHEMA_FILE);
            }
            return new Schema.Parser().parse(schemaContent);
        } catch (IOException e) {
            throw new RuntimeException("Failed to load Parquet schema", e);
        }
    }

}
10 changes: 10 additions & 0 deletions src/main/resources/training-set-schema.avsc
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"namespace": "org.apache.camel",
"type": "record",
"name": "AlpacaRecord",
"fields": [
{"name": "instruction", "type": "string"},
{"name": "input", "type": "string"},
{"name": "output", "type": "string"}
]
}

0 comments on commit 2c3303a

Please sign in to comment.