Skip to content

Commit

Permalink
Enhance: Add comments to chunking-related code (#1106) (#1115)
Browse files Browse the repository at this point in the history
* add logs for chunker parameter parser

Signed-off-by: yuye-aws <[email protected]>

* add logs for delimiter chunker

Signed-off-by: yuye-aws <[email protected]>

* add comments for delimiter chunker

Signed-off-by: yuye-aws <[email protected]>

* update comments for delimiter chunker

Signed-off-by: yuye-aws <[email protected]>

* update comments for chunker interface

Signed-off-by: yuye-aws <[email protected]>

* add comments for chunker factory

Signed-off-by: yuye-aws <[email protected]>

* update comments for chunker interface

Signed-off-by: yuye-aws <[email protected]>

---------

Signed-off-by: yuye-aws <[email protected]>
(cherry picked from commit 3130de3)

Co-authored-by: yuye-aws <[email protected]>
  • Loading branch information
opensearch-trigger-bot[bot] and yuye-aws authored Jan 16, 2025
1 parent 831d962 commit 763fa5e
Show file tree
Hide file tree
Showing 5 changed files with 90 additions and 22 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,16 @@
*/
public interface Chunker {

/** Field name for specifying the maximum chunk limit in the configuration. */
String MAX_CHUNK_LIMIT_FIELD = "max_chunk_limit";

/** Field name for tracking the count of chunked strings. */
String CHUNK_STRING_COUNT_FIELD = "chunk_string_count";

/** Default maximum number of chunks allowed (100). */
int DEFAULT_MAX_CHUNK_LIMIT = 100;

/** Special value (-1) indicating that chunk limiting is disabled. */
int DISABLED_MAX_CHUNK_LIMIT = -1;

/**
Expand All @@ -42,6 +49,7 @@ public interface Chunker {
* @param chunkResultSize the size of chunking result
* @param runtimeMaxChunkLimit runtime max_chunk_limit, used to check with chunkResultSize
* @param chunkStringCount runtime chunk_string_count, used to check with chunkResultSize
* @return true if chunk limiting is enabled and chunkResultSize plus chunkStringCount reaches or exceeds runtimeMaxChunkLimit, false otherwise
*/
static boolean checkRunTimeMaxChunkLimit(int chunkResultSize, int runtimeMaxChunkLimit, int chunkStringCount) {
return runtimeMaxChunkLimit != DISABLED_MAX_CHUNK_LIMIT && chunkResultSize + chunkStringCount >= runtimeMaxChunkLimit;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,16 @@ private ChunkerFactory() {} // no instance of this factory class
DelimiterChunker::new
);

/**
 * Set of supported chunker algorithm types: the key set of {@code CHUNKERS_CONSTRUCTORS}.
 * Declared {@code final} so that the shared reference cannot be reassigned by callers.
 */
public static final Set<String> CHUNKER_ALGORITHMS = CHUNKERS_CONSTRUCTORS.keySet();

/**
* Creates a new Chunker instance based on the specified type and parameters.
*
* @param type the type of chunker to create
* @param parameters configuration parameters for the chunker
* @return a new Chunker instance configured with the given parameters
*/
public static Chunker create(final String type, final Map<String, Object> parameters) {
Function<Map<String, Object>, Chunker> chunkerConstructionFunction = CHUNKERS_CONSTRUCTORS.get(type);
// chunkerConstructionFunction is not null because we have validated the type in text chunking processor
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,12 @@ public final class ChunkerParameterParser {
/** Private constructor that prevents instantiation of this utility class. */
private ChunkerParameterParser() {} // no instance of this util class

/**
* Parse String type parameter.
* Throw IllegalArgumentException if parameter is not a string or an empty string.
* Parses and validates a string parameter from the parameters map.
*
* @param parameters The map containing chunking parameters
* @param fieldName The name of the field to extract from the parameters map
* @return The validated string value from the parameters map
* @throws IllegalArgumentException if the parameter is not a string or is empty
*/
public static String parseString(final Map<String, Object> parameters, final String fieldName) {
Object fieldValue = parameters.get(fieldName);
Expand All @@ -36,9 +40,13 @@ public static String parseString(final Map<String, Object> parameters, final Str
}

/**
* Parse String type parameter.
* Return default value if the parameter is missing.
* Throw IllegalArgumentException if parameter is not a string or an empty string.
* Parses and validates a string parameter from the parameters map with fallback to a default value.
*
* @param parameters The map containing chunking parameters
* @param fieldName The name of the field to extract from the parameters map
* @param defaultValue The default value to return if the parameter is not present
* @return The validated string value from the parameters map if present, otherwise the default value
* @throws IllegalArgumentException if the parameter is present but is not a string or is empty
*/
public static String parseStringWithDefault(final Map<String, Object> parameters, final String fieldName, final String defaultValue) {
if (!parameters.containsKey(fieldName)) {
Expand All @@ -49,8 +57,12 @@ public static String parseStringWithDefault(final Map<String, Object> parameters
}

/**
* Parse integer type parameter with default value.
* Throw IllegalArgumentException if the parameter is not an integer.
* Parses and validates an integer value from the parameters map.
*
* @param parameters The map containing chunking parameters
* @param fieldName The name of the field to extract from the parameters map
* @return The parsed integer value from the parameters map
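* @throws IllegalArgumentException if the parameter is not an integer or is empty
* @throws NullPointerException if the parameters map holds no value for fieldName (the value is dereferenced without a null check)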
*/
public static int parseInteger(final Map<String, Object> parameters, final String fieldName) {
String fieldValueString = parameters.get(fieldName).toString();
Expand All @@ -64,9 +76,13 @@ public static int parseInteger(final Map<String, Object> parameters, final Strin
}

/**
* Parse integer type parameter with default value.
* Return default value if the parameter is missing.
* Throw IllegalArgumentException if the parameter is not an integer.
* Parses and validates an integer parameter from the parameters map with fallback to a default value.
*
* @param parameters The map containing chunking parameters
* @param fieldName The name of the field to extract from the parameters map
* @param defaultValue The default value to return if the parameter is not present
* @return The integer value from the parameters map if present, otherwise the default value
* @throws IllegalArgumentException if the parameter is present but cannot be converted to an integer
*/
public static int parseIntegerWithDefault(final Map<String, Object> parameters, final String fieldName, final int defaultValue) {
if (!parameters.containsKey(fieldName)) {
Expand All @@ -77,9 +93,12 @@ public static int parseIntegerWithDefault(final Map<String, Object> parameters,
}

/**
* Parse integer type parameter with positive value.
* Return default value if the parameter is missing.
* Throw IllegalArgumentException if the parameter is not a positive integer.
* Parses and validates a positive integer parameter from the parameters map.
*
* @param parameters The map containing chunking parameters
* @param fieldName The name of the field to extract from the parameters map
* @return The parsed positive integer value
* @throws IllegalArgumentException if the parameter is not a positive integer or cannot be converted to an integer
*/
public static int parsePositiveInteger(final Map<String, Object> parameters, final String fieldName) {
int fieldValueInt = parseInteger(parameters, fieldName);
Expand All @@ -90,9 +109,13 @@ public static int parsePositiveInteger(final Map<String, Object> parameters, fin
}

/**
* Parse integer type parameter with positive value.
* Return default value if the parameter is missing.
* Throw IllegalArgumentException if the parameter is not a positive integer.
* Parses and validates a positive integer parameter from the parameters map with fallback to a default value.
*
* @param parameters The map containing chunking parameters
* @param fieldName The name of the field to extract from the parameters map
* @param defaultValue The default value to return if the parameter is not present
* @return The positive integer value from the parameters map if present, otherwise the default value
* @throws IllegalArgumentException if the parameter is present but is not a positive integer
*/
public static int parsePositiveIntegerWithDefault(
final Map<String, Object> parameters,
Expand All @@ -107,8 +130,12 @@ public static int parsePositiveIntegerWithDefault(
}

/**
* Parse double type parameter.
* Throw IllegalArgumentException if parameter is not a double.
* Parses and validates a double value from the parameters map.
*
* @param parameters The map containing chunking parameters
* @param fieldName The name of the field to extract from the parameters map
* @return The parsed double value
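* @throws IllegalArgumentException if the parameter cannot be converted to a double
* @throws NullPointerException if the parameters map holds no value for fieldName (the value is dereferenced without a null check)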
*/
public static double parseDouble(final Map<String, Object> parameters, final String fieldName) {
String fieldValueString = parameters.get(fieldName).toString();
Expand All @@ -122,9 +149,13 @@ public static double parseDouble(final Map<String, Object> parameters, final Str
}

/**
* Parse double type parameter.
* Return default value if the parameter is missing.
* Throw IllegalArgumentException if parameter is not a double.
* Parses and validates a double value from the parameters map with fallback to a default value.
*
* @param parameters The map containing chunking parameters
* @param fieldName The name of the field to extract from the parameters map
* @param defaultValue The default value to return if the parameter is not present
* @return The double value from the parameters map if present, otherwise the default value
* @throws IllegalArgumentException if the parameter is present but cannot be converted to a double
*/
public static double parseDoubleWithDefault(final Map<String, Object> parameters, final String fieldName, final double defaultValue) {
if (!parameters.containsKey(fieldName)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,22 @@
*/
public final class DelimiterChunker implements Chunker {

/** The identifier for the delimiter chunking algorithm. */
public static final String ALGORITHM_NAME = "delimiter";

/** The parameter field name for specifying the delimiter. */
public static final String DELIMITER_FIELD = "delimiter";

/** The default delimiter value used when none is specified. Uses two consecutive newline characters to split on paragraph boundaries. */
public static final String DEFAULT_DELIMITER = "\n\n";

/** The delimiter string used for text chunking. */
private String delimiter;

/**
 * Builds a delimiter chunker, delegating all configuration handling to {@code parseParameters}.
 *
 * @param parameters a map with non-runtime parameters to be parsed
 */
public DelimiterChunker(final Map<String, Object> parameters) {
    this.parseParameters(parameters);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,22 @@
*/
public final class FixedTokenLengthChunker implements Chunker {

/** The identifier for the fixed token length chunking algorithm. */
public static final String ALGORITHM_NAME = "fixed_token_length";

// field name for each parameter
/** Field name for the analysis registry configuration parameter. */
public static final String ANALYSIS_REGISTRY_FIELD = "analysis_registry";

/** Field name for specifying the maximum number of tokens per chunk. */
public static final String TOKEN_LIMIT_FIELD = "token_limit";

/** Field name for specifying the overlap rate between consecutive chunks. */
public static final String OVERLAP_RATE_FIELD = "overlap_rate";

/** Field name for specifying the maximum token count allowed in the input text. */
public static final String MAX_TOKEN_COUNT_FIELD = "max_token_count";

/** Field name for specifying the tokenizer to be used for text analysis. */
public static final String TOKENIZER_FIELD = "tokenizer";

// default values for each non-runtime parameter
Expand All @@ -57,6 +66,10 @@ public final class FixedTokenLengthChunker implements Chunker {
private double overlapRate;
private final AnalysisRegistry analysisRegistry;

/**
* Constructor that initializes the fixed token length chunker with the specified parameters.
* @param parameters a map with non-runtime parameters to be parsed
*/
public FixedTokenLengthChunker(final Map<String, Object> parameters) {
parseParameters(parameters);
this.analysisRegistry = (AnalysisRegistry) parameters.get(ANALYSIS_REGISTRY_FIELD);
Expand Down

0 comments on commit 763fa5e

Please sign in to comment.