milaboratory · dbolotin · Aug 20, 2025 · DenKoren · Aug 21, 2025 · DenKoren
diff --git a/.changeset/add-wc-l-countlines.md b/.changeset/add-wc-l-countlines.md
@@ -0,0 +1,6 @@
+---
+"@platforma-open/milaboratories.software-ptexter": minor
+"@platforma-sdk/workflow-tengo": minor
+---
+
+Add countLines function with high-performance Polars line counting and regex filtering support
diff --git a/lib/ptexter/README.md b/lib/ptexter/README.md
@@ -0,0 +1,29 @@
+# @platforma-open/milaboratories.software-ptexter
+
+Text processing utilities backend for Platforma workflows.
+
+## Overview
+
+This package provides Python-based text processing tools that serve as the backend implementation for the Platforma `txt` library. The utilities in this package are designed to be called from Tengo workflows through the corresponding frontend library located at `sdk/workflow-tengo/src/txt/`.
+
+## Architecture
+
+- **Backend (this package)**: Python scripts that perform the actual text processing operations
+- **Frontend**: Tengo library (`txt`) that provides a convenient workflow API and calls these backend utilities
+
+## Usage
+
+This package is typically not used directly. Instead, use the `txt` library in your Tengo workflows:
+
+```tengo
+txt := import(":txt")
+
+// The txt library will automatically call the appropriate ptexter backend utilities
+result := txt.head(inputs.myFile, {lines: 10})
+```
+
+The backend utilities are packaged as Platforma software artifacts and automatically managed by the platform's execution environment.
+
+## Development
+
+This package follows the standard Platforma software packaging conventions and is built using the `@platforma-sdk/package-builder` toolchain.
diff --git a/lib/ptexter/package.json b/lib/ptexter/package.json
@@ -25,7 +25,7 @@
             "environment": "@platforma-open/milaboratories.runenv-python-3:3.12.6",
             "dependencies": {
               "toolset": "pip",
-              "requirements": "requirements.txt"
+              "requirements": "requirements-head.txt"
             },
             "root": "./src"
           },
@@ -34,6 +34,24 @@
             "{pkg}/phead-lines.py"
           ]
         }
+      },
+      "wc-l": {
+        "binary": {
+          "artifact": {
+            "type": "python",
+            "registry": "platforma-open",
+            "environment": "@platforma-open/milaboratories.runenv-python-3:3.12.6",
+            "dependencies": {
+              "toolset": "pip",
+              "requirements": "requirements-wc-l.txt"
+            },
+            "root": "./src"
+          },
+          "cmd": [
+            "python",
+            "{pkg}/wc-l.py"
+          ]
+        }
       }
     }
   }

diff --git a/lib/ptexter/src/requirements-head.txt b/lib/ptexter/src/requirements-head.txt
@@ -0,0 +1 @@
+# No external dependencies for head endpoint - uses only Python standard library
diff --git a/lib/ptexter/src/requirements-wc-l.txt b/lib/ptexter/src/requirements-wc-l.txt
@@ -0,0 +1,2 @@
+# Requirements for wc-l endpoint - high performance line counting
+polars-lts-cpu==1.30.0
diff --git a/lib/ptexter/src/requirements.txt b/lib/ptexter/src/requirements.txt
@@ -1 +1,2 @@
-# No external dependencies - uses only Python standard library
+# Requirements for wc-l endpoint - high performance line counting
+polars-lts-cpu==1.30.0
diff --git a/lib/ptexter/src/wc-l.py b/lib/ptexter/src/wc-l.py
@@ -0,0 +1,154 @@
+#!/usr/bin/env python3
+"""
+wc-l.py - Count lines in a text file with optional regex filtering
+
+High-performance line counter using Polars with optional regex pattern to ignore certain lines.
+Outputs just the count number (no trailing newline) to the specified output file.
+"""
+
+import argparse
+import sys
+import re
+from pathlib import Path
+import polars as pl
+
+
+def count_lines_optimized(input_file: str, ignore_pattern: str = None) -> int:
+    """
+    Count lines using optimized Polars approach (best from benchmarks: 31,000+ MB/s).
+    
+    Args:
+        input_file: Path to input file
+        ignore_pattern: Optional regex pattern - lines matching this will be ignored
+        
+    Returns:
+        Number of lines (excluding ignored lines)
+    """
+    # Use the optimized single-column approach from our benchmarks
+    df = pl.scan_csv(
+        input_file,
+        has_header=False,
+        separator='\x00',  # Null separator to read as single column
+        infer_schema_length=0,
+        ignore_errors=True,
+        low_memory=True,
+    )
+
+    if ignore_pattern is None:
+        # Fast path - just count all lines
+        return df.select(pl.len()).collect().item()
+    else:
+        # Need to filter lines - read the column and apply regex filter
+        lines_df = df.collect()
+
+        # Get the column (should be the first/only column)
+        col_name = lines_df.columns[0]
+
+        # Filter out lines matching the ignore pattern
+        filtered_df = lines_df.filter(
+            ~pl.col(col_name).str.contains(ignore_pattern, literal=False)
+        )
+
+        return len(filtered_df)
-        lines_df = df.collect()
-        
-        # Get the column (should be the first/only column)
-        col_name = lines_df.columns[0]
-        
-        # Filter out lines matching the ignore pattern
-        filtered_df = lines_df.filter(
-            ~pl.col(col_name).str.contains(ignore_pattern, literal=False)
-        )
-        
-        return len(filtered_df)
+        # Perform filtering lazily on the scanned dataframe
+        # to avoid loading the entire file into memory.
+        col_name = df.columns[0]
+
+        filtered_lazy_df = df.filter(
+            ~pl.col(col_name).str.contains(ignore_pattern, literal=False)
+        )
+
+        # Collect only the final count, which is very memory-efficient.
+        return filtered_lazy_df.select(pl.len()).collect().item()
-        lines_df = df.collect()
-        
-        # Get the column (should be the first/only column)
-        col_name = lines_df.columns[0]
-        
-        # Filter out lines matching the ignore pattern
-        filtered_df = lines_df.filter(
-            ~pl.col(col_name).str.contains(ignore_pattern, literal=False)
-        )
-        
-        return len(filtered_df)
+        # Perform filtering lazily on the scanned dataframe
+        # to avoid loading the entire file into memory.
+        col_name = df.columns[0]
+
+        filtered_lazy_df = df.filter(
+            ~pl.col(col_name).str.contains(ignore_pattern, literal=False)
+        )
+
+        # Collect only the final count, which is very memory-efficient.
+        return filtered_lazy_df.select(pl.len()).collect().item()
+
+
+def wc_lines(input_file: str, output_file: str, ignore_pattern: str = None):
+    """
+    Count lines in input_file and write count to output_file.
+    
+    Args:
+        input_file: Path to input file
+        output_file: Path to output file (will contain just the count)
+        ignore_pattern: Optional regex pattern - lines matching this will be ignored
+    """
+    try:
+        input_path = Path(input_file)
+        output_path = Path(output_file)
+
+        if not input_path.exists():
+            raise FileNotFoundError(f"Input file not found: {input_file}")
+
+        if not input_path.is_file():
+            raise ValueError(f"Input path is not a file: {input_file}")
+
+        # Create output directory if it doesn't exist
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+
+        # Count lines using optimized Polars approach
+        line_count = count_lines_optimized(input_file, ignore_pattern)
+
+        # Write count to output file (no trailing newline as requested)
+        with open(output_path, 'w', encoding='utf-8') as outfile:
+            outfile.write(str(line_count))
+
+        return line_count
+
+    except UnicodeDecodeError as e:
+        raise ValueError(f"Failed to decode input file as UTF-8: {e}") from e
+    except IOError as e:
+        raise IOError(f"File I/O error: {e}") from e
+    except re.error as e:
+        raise ValueError(f"Invalid regex pattern '{ignore_pattern}': {e}") from e
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='Count lines in a text file with optional regex filtering',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  python wc-l.py input.txt output.txt
+  python wc-l.py --ignore-pattern '^#' input.txt output.txt  # Skip comment lines
+  python wc-l.py --ignore-pattern '^\\s*$' input.txt output.txt  # Skip empty lines
+        """
+    )
+
+    parser.add_argument(
+        '--ignore-pattern',
+        type=str,
+        help='Optional regex pattern - lines matching this pattern will be ignored'
+    )
+
+    parser.add_argument(
+        'input_file',
+        help='Input text file path'
+    )
+
+    parser.add_argument(
+        'output_file', 
+        help='Output file path (will contain just the line count)'
+    )
+
+    args = parser.parse_args()
+
+    # Validate regex pattern if provided
+    if args.ignore_pattern:
+        try:
+            re.compile(args.ignore_pattern)
+        except re.error as e:
+            print(f"Error: Invalid regex pattern '{args.ignore_pattern}': {e}", file=sys.stderr)
+            sys.exit(1)
+
+    try:
+        line_count = wc_lines(
+            args.input_file, 
+            args.output_file,
+            args.ignore_pattern
+        )
+
+        ignored_msg = f" (excluding lines matching '{args.ignore_pattern}')" if args.ignore_pattern else ""
+        print(f"Successfully counted {line_count} lines{ignored_msg} and wrote to {args.output_file}")
+
+    except (FileNotFoundError, ValueError, IOError) as e:
+        print(f"Error: {e}", file=sys.stderr)
+        sys.exit(1)
+    except KeyboardInterrupt:
+        print("\nOperation cancelled by user", file=sys.stderr)
+        sys.exit(1)
+    except Exception as e:
+        print(f"Unexpected error: {e}", file=sys.stderr)
+        sys.exit(1)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/sdk/workflow-tengo/src/txt/index.lib.tengo b/sdk/workflow-tengo/src/txt/index.lib.tengo
@@ -65,6 +65,65 @@ head := func(fileRef, opts) {
     return result.getFileContent("output.txt")
 }
 
+/**
+ * Counts lines in a text file with optional regex filtering.
+ *
+ * @param fileRef {resource} - Resource reference to the input text file
+ * @param ...opts {map} (optional) - Options map with optional fields:
+ *   - ignorePattern {string} (optional): Regex pattern - lines matching this will be ignored
+ * @returns {number} - Number of lines in the file (excluding ignored lines)
+ * @example
+ *   // Count all lines
+ *   lineCount := txt.countLines(inputs.myFile)
+ *
+ *   // Count lines ignoring comments
+ *   lineCount := txt.countLines(inputs.myFile, {ignorePattern: "^#"})
+ *
+ *   // Count non-empty lines
+ *   lineCount := txt.countLines(inputs.myFile, {ignorePattern: "^\\s*$"})
+ */
+countLines := func(fileRef, ...opts) {
+    if !smart.isReference(fileRef) {
+        ll.panic("fileRef must be a valid resource reference. Got: %T", fileRef)
+    }
+
+    if len(opts) == 0 {
+        opts = {}
+    } else if len(opts) == 1 {
+        opts = opts[0]
+    } else {
+        ll.panic("too many arguments")
+    }
+
+    if !is_map(opts) {
+        ll.panic("opts must be a map or undefined. Got: %T", opts)
+    }
+
+    wcSw := assets.importSoftware("@platforma-open/milaboratories.software-ptexter:wc-l")
+
+    cmdBuilder := exec.builder().
+        software(wcSw)
+
+    if !is_undefined(opts.ignorePattern) {
+        if !is_string(opts.ignorePattern) {
+            ll.panic("opts.ignorePattern must be a string. Got: %T", opts.ignorePattern)
+        }
+        cmdBuilder = cmdBuilder.
+            arg("--ignore-pattern").
+            arg(opts.ignorePattern)
+    }
+
+    cmdBuilder = cmdBuilder.
+        arg("input.txt").
+        arg("output.txt").
+        addFile("input.txt", fileRef).
+        saveFileContent("output.txt")
+
+    result := cmdBuilder.run()
+    return result.getFileContent("output.txt")
+}
+
 export ll.toStrict({
-    head: head
+    head: head,
+    countLines: countLines
 })
diff --git a/tests/workflow-tengo/src/test/txt/countLines.tpl.tengo b/tests/workflow-tengo/src/test/txt/countLines.tpl.tengo
@@ -0,0 +1,24 @@
+// txt countLines function test template
+
+self := import("@platforma-sdk/workflow-tengo:tpl")
+file := import("@platforma-sdk/workflow-tengo:file")
+txt := import("@platforma-sdk/workflow-tengo:txt")
+
+self.defineOutputs(["result", "progress"])
+
+self.body(func(inputs) {
+    importResult := file.importFile(inputs.importHandle)
+
+    // Apply txt.countLines function with the specified options
+    countResult := undefined
+    if inputs.countOptions == false {
+        countResult = txt.countLines(importResult.file)
+    } else {
+        countResult = txt.countLines(importResult.file, inputs.countOptions)
+    }
+
+    return {
+        result: countResult,
+        progress: importResult.handle
+    }
+})
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		# No external dependencies for head endpoint - uses only Python standard library
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		# Requirements for wc-l endpoint - high performance line counting
Copy link Member DenKoren Aug 21, 2025 • edited Loading Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. Do we really need them to be different for wc-l and head?
		polars-lts-cpu==1.30.0