diff --git a/.changeset/add-wc-l-countlines.md b/.changeset/add-wc-l-countlines.md new file mode 100644 index 0000000000..93c46bbc2a --- /dev/null +++ b/.changeset/add-wc-l-countlines.md @@ -0,0 +1,6 @@ +--- +"@platforma-open/milaboratories.software-ptexter": minor +"@platforma-sdk/workflow-tengo": minor +--- + +Add countLines function with high-performance Polars line counting and regex filtering support diff --git a/lib/ptexter/README.md b/lib/ptexter/README.md new file mode 100644 index 0000000000..bddc1918ad --- /dev/null +++ b/lib/ptexter/README.md @@ -0,0 +1,29 @@ +# @platforma-open/milaboratories.software-ptexter + +Text processing utilities backend for Platforma workflows. + +## Overview + +This package provides Python-based text processing tools that serve as the backend implementation for the Platforma `txt` library. The utilities in this package are designed to be called from Tengo workflows through the corresponding frontend library located at `sdk/workflow-tengo/src/txt/`. + +## Architecture + +- **Backend (this package)**: Python scripts that perform the actual text processing operations +- **Frontend**: Tengo library (`txt`) that provides a convenient workflow API and calls these backend utilities + +## Usage + +This package is typically not used directly. Instead, use the `txt` library in your Tengo workflows: + +```tengo +txt := import(":txt") + +// The txt library will automatically call the appropriate ptexter backend utilities +result := txt.head(inputs.myFile, {lines: 10}) +``` + +The backend utilities are packaged as Platforma software artifacts and automatically managed by the platform's execution environment. + +## Development + +This package follows the standard Platforma software packaging conventions and is built using the `@platforma-sdk/package-builder` toolchain. diff --git a/lib/ptexter/package.json b/lib/ptexter/package.json index 97b595c2a2..e334946f88 100644 --- a/lib/ptexter/package.json +++ b/lib/ptexter/package.json @@ -25,7 +25,7 @@ "environment": "@platforma-open/milaboratories.runenv-python-3:3.12.6", "dependencies": { "toolset": "pip", - "requirements": "requirements.txt" + "requirements": "requirements-head.txt" }, "root": "./src" }, @@ -34,6 +34,24 @@ "{pkg}/phead-lines.py" ] } + }, + "wc-l": { + "binary": { + "artifact": { + "type": "python", + "registry": "platforma-open", + "environment": "@platforma-open/milaboratories.runenv-python-3:3.12.6", + "dependencies": { + "toolset": "pip", + "requirements": "requirements-wc-l.txt" + }, + "root": "./src" + }, + "cmd": [ + "python", + "{pkg}/wc-l.py" + ] + } } } } diff --git a/lib/ptexter/src/requirements-head.txt b/lib/ptexter/src/requirements-head.txt new file mode 100644 index 0000000000..41443ecba5 --- /dev/null +++ b/lib/ptexter/src/requirements-head.txt @@ -0,0 +1 @@ +# No external dependencies for head endpoint - uses only Python standard library diff --git a/lib/ptexter/src/requirements-wc-l.txt b/lib/ptexter/src/requirements-wc-l.txt new file mode 100644 index 0000000000..f426912a01 --- /dev/null +++ b/lib/ptexter/src/requirements-wc-l.txt @@ -0,0 +1,2 @@ +# Requirements for wc-l endpoint - high performance line counting +polars-lts-cpu==1.30.0 diff --git a/lib/ptexter/src/requirements.txt b/lib/ptexter/src/requirements.txt index 34132703e8..f426912a01 100644 --- a/lib/ptexter/src/requirements.txt +++ b/lib/ptexter/src/requirements.txt @@ -1 +1,2 @@ -# No external dependencies - uses only Python standard library +# Requirements for wc-l endpoint - high performance line counting +polars-lts-cpu==1.30.0 diff --git a/lib/ptexter/src/wc-l.py b/lib/ptexter/src/wc-l.py new file mode 100644 index 0000000000..6366546c64 --- /dev/null +++ b/lib/ptexter/src/wc-l.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python3 +""" +wc-l.py - Count lines in a text file with optional regex filtering + +High-performance line counter using Polars with optional regex pattern to ignore certain lines. +Outputs just the count number (no trailing newline) to the specified output file. +""" + +import argparse +import sys +import re +from pathlib import Path +import polars as pl + + +def count_lines_optimized(input_file: str, ignore_pattern: str = None) -> int: + """ + Count lines using optimized Polars approach (best from benchmarks: 31,000+ MB/s). + + Args: + input_file: Path to input file + ignore_pattern: Optional regex pattern - lines matching this will be ignored + + Returns: + Number of lines (excluding ignored lines) + """ + # Use the optimized single-column approach from our benchmarks + df = pl.scan_csv( + input_file, + has_header=False, + separator='\x00', # Null separator to read as single column + infer_schema_length=0, + ignore_errors=True, + low_memory=True, + ) + + if ignore_pattern is None: + # Fast path - just count all lines + return df.select(pl.len()).collect().item() + else: + # Need to filter lines - read the column and apply regex filter + lines_df = df.collect() + + # Get the column (should be the first/only column) + col_name = lines_df.columns[0] + + # Filter out lines matching the ignore pattern + filtered_df = lines_df.filter( + ~pl.col(col_name).str.contains(ignore_pattern, literal=False) + ) + + return len(filtered_df) + + +def wc_lines(input_file: str, output_file: str, ignore_pattern: str = None): + """ + Count lines in input_file and write count to output_file. + + Args: + input_file: Path to input file + output_file: Path to output file (will contain just the count) + ignore_pattern: Optional regex pattern - lines matching this will be ignored + """ + try: + input_path = Path(input_file) + output_path = Path(output_file) + + if not input_path.exists(): + raise FileNotFoundError(f"Input file not found: {input_file}") + + if not input_path.is_file(): + raise ValueError(f"Input path is not a file: {input_file}") + + # Create output directory if it doesn't exist + output_path.parent.mkdir(parents=True, exist_ok=True) + + # Count lines using optimized Polars approach + line_count = count_lines_optimized(input_file, ignore_pattern) + + # Write count to output file (no trailing newline as requested) + with open(output_path, 'w', encoding='utf-8') as outfile: + outfile.write(str(line_count)) + + return line_count + + except UnicodeDecodeError as e: + raise ValueError(f"Failed to decode input file as UTF-8: {e}") from e + except IOError as e: + raise IOError(f"File I/O error: {e}") from e + except re.error as e: + raise ValueError(f"Invalid regex pattern '{ignore_pattern}': {e}") from e + + +def main(): + parser = argparse.ArgumentParser( + description='Count lines in a text file with optional regex filtering', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + python wc-l.py input.txt output.txt + python wc-l.py --ignore-pattern '^#' input.txt output.txt # Skip comment lines + python wc-l.py --ignore-pattern '^\\s*$' input.txt output.txt # Skip empty lines + """ + ) + + parser.add_argument( + '--ignore-pattern', + type=str, + help='Optional regex pattern - lines matching this pattern will be ignored' + ) + + parser.add_argument( + 'input_file', + help='Input text file path' + ) + + parser.add_argument( + 'output_file', + help='Output file path (will contain just the line count)' + ) + + args = parser.parse_args() + + # Validate regex pattern if provided + if args.ignore_pattern: + try: + re.compile(args.ignore_pattern) + except re.error as e: + print(f"Error: Invalid regex pattern '{args.ignore_pattern}': {e}", file=sys.stderr) + sys.exit(1) + + try: + line_count = wc_lines( + args.input_file, + args.output_file, + args.ignore_pattern + ) + + ignored_msg = f" (excluding lines matching '{args.ignore_pattern}')" if args.ignore_pattern else "" + print(f"Successfully counted {line_count} lines{ignored_msg} and wrote to {args.output_file}") + + except (FileNotFoundError, ValueError, IOError) as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + except KeyboardInterrupt: + print("\nOperation cancelled by user", file=sys.stderr) + sys.exit(1) + except Exception as e: + print(f"Unexpected error: {e}", file=sys.stderr) + sys.exit(1) + + +if __name__ == '__main__': + main() diff --git a/sdk/workflow-tengo/src/txt/index.lib.tengo b/sdk/workflow-tengo/src/txt/index.lib.tengo index 7ed51f1457..6a9601d454 100644 --- a/sdk/workflow-tengo/src/txt/index.lib.tengo +++ b/sdk/workflow-tengo/src/txt/index.lib.tengo @@ -65,6 +65,65 @@ head := func(fileRef, opts) { return result.getFileContent("output.txt") } +/** + * Counts lines in a text file with optional regex filtering. + * + * @param fileRef {resource} - Resource reference to the input text file + * @param ...opts {map} (optional) - Options map with optional fields: + * - ignorePattern {string} (optional): Regex pattern - lines matching this will be ignored + * @returns {number} - Number of lines in the file (excluding ignored lines) + * @example + * // Count all lines + * lineCount := txt.countLines(inputs.myFile) + * + * // Count lines ignoring comments + * lineCount := txt.countLines(inputs.myFile, {ignorePattern: "^#"}) + * + * // Count non-empty lines + * lineCount := txt.countLines(inputs.myFile, {ignorePattern: "^\\s*$"}) + */ +countLines := func(fileRef, ...opts) { + if !smart.isReference(fileRef) { + ll.panic("fileRef must be a valid resource reference. Got: %T", fileRef) + } + + if len(opts) == 0 { + opts = {} + } else if len(opts) == 1 { + opts = opts[0] + } else { + ll.panic("too many arguments") + } + + if !is_map(opts) { + ll.panic("opts must be a map or undefined. Got: %T", opts) + } + + wcSw := assets.importSoftware("@platforma-open/milaboratories.software-ptexter:wc-l") + + cmdBuilder := exec.builder(). + software(wcSw) + + if !is_undefined(opts.ignorePattern) { + if !is_string(opts.ignorePattern) { + ll.panic("opts.ignorePattern must be a string. Got: %T", opts.ignorePattern) + } + cmdBuilder = cmdBuilder. + arg("--ignore-pattern"). + arg(opts.ignorePattern) + } + + cmdBuilder = cmdBuilder. + arg("input.txt"). + arg("output.txt"). + addFile("input.txt", fileRef). + saveFileContent("output.txt") + + result := cmdBuilder.run() + return result.getFileContent("output.txt") +} + export ll.toStrict({ - head: head + head: head, + countLines: countLines }) diff --git a/tests/workflow-tengo/src/test/txt/countLines.tpl.tengo b/tests/workflow-tengo/src/test/txt/countLines.tpl.tengo new file mode 100644 index 0000000000..4fded138b9 --- /dev/null +++ b/tests/workflow-tengo/src/test/txt/countLines.tpl.tengo @@ -0,0 +1,24 @@ +// txt countLines function test template + +self := import("@platforma-sdk/workflow-tengo:tpl") +file := import("@platforma-sdk/workflow-tengo:file") +txt := import("@platforma-sdk/workflow-tengo:txt") + +self.defineOutputs(["result", "progress"]) + +self.body(func(inputs) { + importResult := file.importFile(inputs.importHandle) + + // Apply txt.countLines function with the specified options + countResult := undefined + if inputs.countOptions == false { + countResult = txt.countLines(importResult.file) + } else { + countResult = txt.countLines(importResult.file, inputs.countOptions) + } + + return { + result: countResult, + progress: importResult.handle + } +}) diff --git a/tests/workflow-tengo/src/test/txt/txt.test.ts b/tests/workflow-tengo/src/test/txt/txt.test.ts index 163d1d81f3..7d322734e6 100644 --- a/tests/workflow-tengo/src/test/txt/txt.test.ts +++ b/tests/workflow-tengo/src/test/txt/txt.test.ts @@ -167,3 +167,85 @@ tplTest( ).rejects.toThrow(/would exceed.*byte limit/); }, ); + +// Test countLines function +type CountLinesTestInput = { + name: string; + fileName: string; + countOptions?: { ignorePattern?: string }; + expectedCount: number; + handleProvider: ( + driverKit: MiddleLayerDriverKit + ) => Promise; +}; + +const countLinesCases: CountLinesTestInput[] = [ + { + name: 'count-all-lines-basic', + fileName: 'maybe_the_number_of_lines_is_the_answer.txt', + countOptions: undefined, // No options - count all lines + expectedCount: 42, // Assuming the file has 42 lines + handleProvider: async (driverKit) => { + return await driverKit.lsDriver.getLocalFileHandle( + path.resolve('../../assets/maybe_the_number_of_lines_is_the_answer.txt'), + ); + }, + }, + { + name: 'count-lines-with-comment-filter', + fileName: 'maybe_the_number_of_lines_is_the_answer.txt', + countOptions: { ignorePattern: '^#' }, // Ignore lines starting with # + expectedCount: 42, // Assuming no comment lines in this file, same count + handleProvider: async (driverKit) => { + return await driverKit.lsDriver.getLocalFileHandle( + path.resolve('../../assets/maybe_the_number_of_lines_is_the_answer.txt'), + ); + }, + }, +]; + +tplTest.for(countLinesCases)( + 'txt.countLines test: $name', + async ({ handleProvider, countOptions, expectedCount }, { helper, expect, driverKit }) => { + const importHandle = await handleProvider(driverKit); + const result = await helper.renderTemplate( + false, + 'test.txt.countLines', + ['result', 'progress'], + (tx) => ({ + importHandle: tx.createValue( + Pl.JsonObject, + JSON.stringify(importHandle), + ), + countOptions: countOptions + ? tx.createValue( + Pl.JsonObject, + JSON.stringify(countOptions), + ) + : tx.createValue(Pl.JsonObject, 'false'), + }), + ); + + const progress = result + .computeOutput('progress', (a, ctx) => { + if (a === undefined) return undefined; + return driverKit.uploadDriver.getProgressId(a.persist(), ctx); + }) + .withPreCalculatedValueTree(); + + const countResult = result + .computeOutput('result', (a) => { + if (a === undefined) return undefined; + return parseInt(a.getDataAsString()!); // countLines returns an integer + }) + .withPreCalculatedValueTree(); + + const progressStableValue = await progress.awaitStableValue(); + expect(progressStableValue).toBeDefined(); + expect(progressStableValue).toMatchObject({ done: true }); + + const countStableValue = await countResult.awaitStableValue(); + expect(countStableValue).toBeDefined(); + expect(countStableValue).toEqual(expectedCount); + }, +);