diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/DocumentCharacterTextSplitter.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/DocumentCharacterTextSplitter.scala index 27cc0acc8665cd..20580fc9f19be4 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/DocumentCharacterTextSplitter.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/DocumentCharacterTextSplitter.scala @@ -1,3 +1,18 @@ +/* + * Copyright 2017-2023 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package com.johnsnowlabs.nlp.annotators import com.johnsnowlabs.nlp.functions.ExplodeAnnotations @@ -272,6 +287,6 @@ class DocumentCharacterTextSplitter(override val uid: String) } /** This is the companion object of [[DocumentCharacterTextSplitter]]. Please refer to that class - * for the documentation. - */ + * for the documentation. + */ object DocumentCharacterTextSplitter extends DefaultParamsReadable[DocumentCharacterTextSplitter] diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/DocumentTokenSplitter.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/DocumentTokenSplitter.scala index 1acfd42d710bca..6499d584c79182 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/DocumentTokenSplitter.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/DocumentTokenSplitter.scala @@ -1,12 +1,27 @@ +/* + * Copyright 2017-2023 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package com.johnsnowlabs.nlp.annotators +import com.johnsnowlabs.nlp.functions.ExplodeAnnotations import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel, AnnotatorType, HasSimpleAnnotate} import org.apache.spark.ml.param.{BooleanParam, IntParam} -import org.apache.spark.ml.util.Identifiable +import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} import org.apache.spark.sql.DataFrame import scala.util.matching.Regex -import com.johnsnowlabs.nlp.functions.ExplodeAnnotations /** Annotator that splits large documents into smaller documents based on the number of tokens in * the text. @@ -223,3 +238,8 @@ class DocumentTokenSplitter(override val uid: String) if (getExplodeSplits) dataset.explodeAnnotationsCol(getOutputCol, getOutputCol) else dataset } } + +/** This is the companion object of [[DocumentTokenSplitter]]. Please refer to that class for the + * documentation. + */ +object DocumentTokenSplitter extends DefaultParamsReadable[DocumentTokenSplitter] diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/DocumentCharacterTextSplitterTest.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/DocumentCharacterTextSplitterTest.scala index b554a83ac046c7..ca7d1317c6f3f7 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/DocumentCharacterTextSplitterTest.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/DocumentCharacterTextSplitterTest.scala @@ -1,7 +1,21 @@ +/* + * Copyright 2017-2023 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package com.johnsnowlabs.nlp.annotators import com.johnsnowlabs.nlp.Annotation -import com.johnsnowlabs.nlp.annotator.DocumentCharacterTextSplitter import com.johnsnowlabs.nlp.base.DocumentAssembler import com.johnsnowlabs.nlp.util.io.ResourceHelper import com.johnsnowlabs.tags.{FastTest, SlowTest} diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/DocumentTokenSplitterTest.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/DocumentTokenSplitterTest.scala index 036205711dc924..92e54bcaf59b49 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/DocumentTokenSplitterTest.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/DocumentTokenSplitterTest.scala @@ -1,9 +1,25 @@ +/* + * Copyright 2017-2023 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package com.johnsnowlabs.nlp.annotators import com.johnsnowlabs.nlp.Annotation import com.johnsnowlabs.nlp.base.DocumentAssembler import com.johnsnowlabs.nlp.util.io.ResourceHelper import com.johnsnowlabs.tags.FastTest +import org.apache.spark.ml.Pipeline import org.apache.spark.sql.DataFrame import org.scalatest.flatspec.AnyFlatSpec @@ -55,4 +71,42 @@ class DocumentTokenSplitterTest extends AnyFlatSpec { } } + it should "be serializable" taggedAs FastTest in { + val numTokens = 3 + val textSplitter = new DocumentTokenSplitter() + .setInputCols("document") + .setOutputCol("splits") + .setNumTokens(numTokens) + .setTokenOverlap(1) + + val pipeline = new Pipeline().setStages(Array(documentAssembler, textSplitter)) + val pipelineModel = pipeline.fit(textDf) + + pipelineModel.stages.last + .asInstanceOf[DocumentTokenSplitter] + .write + .overwrite() + .save("./tmp_textSplitter") + + val loadedTextSplitModel = DocumentTokenSplitter.load("tmp_textSplitter") + + loadedTextSplitModel.transform(textDocument).select("splits").show(truncate = false) + } + + it should "be exportable to pipeline" taggedAs FastTest in { + val numTokens = 3 + val textSplitter = new DocumentTokenSplitter() + .setInputCols("document") + .setOutputCol("splits") + .setNumTokens(numTokens) + .setTokenOverlap(1) + + val pipeline = new Pipeline().setStages(Array(documentAssembler, textSplitter)) + pipeline.write.overwrite().save("tmp_textsplitter_pipe") + + val loadedPipelineModel = Pipeline.load("tmp_textsplitter_pipe") + + loadedPipelineModel.fit(textDf).transform(textDf).select("splits").show() + } + }