From 2cb48023fb7c21ce876431371feb6dca459100ae Mon Sep 17 00:00:00 2001 From: nitink Date: Wed, 23 Aug 2023 09:50:21 +0530 Subject: [PATCH] OCR_ImageDraw&DicomDraw_Regions --- docs/Gemfile.lock | 74 +- docs/en/ocr_pipeline_components.md | 102 +- ...raph_builders.AssertionTFGraphBuilder.html | 650 ------ ...lders.GenericClassifierTFGraphBuilder.html | 650 ------ ...ders.graph_builders.NerTFGraphBuilder.html | 636 ------ ...ders.RelationExtractionTFGraphBuilder.html | 645 ------ ...uilders.graph_builders.TFGraphBuilder.html | 626 ------ ....graph_builders.TFGraphBuilderFactory.html | 660 ------ ...graph_builders.TensorflowAddonsNeeded.html | 583 ------ ...uilders.graph_builders.WrongTFVersion.html | 581 ------ ...jsl._tf_graph_builders.graph_builders.html | 620 ------ ...raph_builders.AssertionTFGraphBuilder.html | 259 --- ...lders.GenericClassifierTFGraphBuilder.html | 257 --- ...s_1x.graph_builders.NerTFGraphBuilder.html | 245 --- ...ders.RelationExtractionTFGraphBuilder.html | 254 --- ...ders_1x.graph_builders.TFGraphBuilder.html | 235 --- ....graph_builders.TFGraphBuilderFactory.html | 270 --- ...ders_1x.graph_builders.WrongTFVersion.html | 192 -- ...._tf_graph_builders_1x.graph_builders.html | 226 --- ...arknlp_jsl.annotator.AnnotationMerger.html | 1044 ---------- ...nlp_jsl.annotator.AssertionDLApproach.html | 1467 -------------- ...arknlp_jsl.annotator.AssertionDLModel.html | 1118 ----------- ...rknlp_jsl.annotator.AssertionFilterer.html | 1138 ----------- ...jsl.annotator.AssertionLogRegApproach.html | 1169 ----------- ...lp_jsl.annotator.AssertionLogRegModel.html | 1136 ----------- ...rknlp_jsl.annotator.AverageEmbeddings.html | 983 --------- ...annotator.BertSentenceChunkEmbeddings.html | 1351 ------------- .../sparknlp_jsl.annotator.Chunk2Token.html | 983 --------- ...sparknlp_jsl.annotator.ChunkConverter.html | 1020 ---------- .../sparknlp_jsl.annotator.ChunkFilterer.html | 1097 ---------- ...p_jsl.annotator.ChunkFiltererApproach.html | 1189 ----------- ...sl.annotator.ChunkKeyPhraseExtraction.html | 1465 -------------- ...knlp_jsl.annotator.ChunkMergeApproach.html | 1199 ----------- ...parknlp_jsl.annotator.ChunkMergeModel.html | 1064 ---------- ...p_jsl.annotator.ChunkSentenceSplitter.html | 1086 ---------- ...lp_jsl.annotator.CommonResolverParams.html | 1016 ---------- ...sl.annotator.ContextualParserApproach.html | 1179 ----------- ...p_jsl.annotator.ContextualParserModel.html | 1122 ----------- ...sparknlp_jsl.annotator.DateNormalizer.html | 1115 ----------- ...arknlp_jsl.annotator.DeIdentification.html | 1653 --------------- ...p_jsl.annotator.DeIdentificationModel.html | 1543 -------------- ...ator.DocumentLogRegClassifierApproach.html | 1232 ------------ ...notator.DocumentLogRegClassifierModel.html | 1097 ---------- ...sparknlp_jsl.annotator.DrugNormalizer.html | 1072 ---------- ...p_jsl.annotator.EntityChunkEmbeddings.html | 1356 ------------- ...l.annotator.GenericClassifierApproach.html | 1286 ------------ ..._jsl.annotator.GenericClassifierModel.html | 1033 ---------- .../sparknlp_jsl.annotator.IOBTagger.html | 1063 ---------- ....MedicalBertForSequenceClassification.html | 1264 ------------ ...notator.MedicalBertForTokenClassifier.html | 1256 ------------ ...alDistilBertForSequenceClassification.html | 1278 ------------ ...knlp_jsl.annotator.MedicalNerApproach.html | 1764 ----------------- ...parknlp_jsl.annotator.MedicalNerModel.html | 1251 ------------ .../sparknlp_jsl.annotator.NerChunker.html | 1080 ---------- ...lp_jsl.annotator.NerConverterInternal.html | 1170 
----------- ...arknlp_jsl.annotator.NerDisambiguator.html | 1271 ------------ ...p_jsl.annotator.NerDisambiguatorModel.html | 1227 ------------ ...parknlp_jsl.annotator.PosologyREModel.html | 1078 ---------- ...rknlp_jsl.annotator.RENerChunksFilter.html | 1145 ----------- ...arknlp_jsl.annotator.ReIdentification.html | 978 --------- ....annotator.RelationExtractionApproach.html | 1364 ------------- ...l.annotator.RelationExtractionDLModel.html | 1191 ----------- ...jsl.annotator.RelationExtractionModel.html | 1203 ----------- .../sparknlp_jsl.annotator.Router.html | 1105 ----------- ...otator.SentenceEntityResolverApproach.html | 1345 ------------- ...annotator.SentenceEntityResolverModel.html | 1385 ------------- ..._jsl.annotator.SentenceResolverParams.html | 769 ------- .../_autosummary/sparknlp_jsl.annotator.html | 746 ------- .../sparknlp_jsl.base.FeaturesAssembler.html | 918 --------- .../_autosummary/sparknlp_jsl.base.html | 590 ------ ...jsl.training.AnnotationToolJsonReader.html | 642 ------ ...sparknlp_jsl.training.CantemistReader.html | 620 ------ .../sparknlp_jsl.training.CodiEspReader.html | 620 ------ .../_autosummary/sparknlp_jsl.training.html | 596 ------ .../sparknlp_jsl.training.tf_graph.html | 188 -- .../sparknlp_jsl.training.tf_graph_1x.html | 188 -- 76 files changed, 130 insertions(+), 70143 deletions(-) delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders.graph_builders.AssertionTFGraphBuilder.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders.graph_builders.GenericClassifierTFGraphBuilder.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders.graph_builders.NerTFGraphBuilder.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders.graph_builders.RelationExtractionTFGraphBuilder.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders.graph_builders.TFGraphBuilder.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders.graph_builders.TFGraphBuilderFactory.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders.graph_builders.TensorflowAddonsNeeded.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders.graph_builders.WrongTFVersion.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders.graph_builders.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders_1x.graph_builders.AssertionTFGraphBuilder.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders_1x.graph_builders.GenericClassifierTFGraphBuilder.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders_1x.graph_builders.NerTFGraphBuilder.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders_1x.graph_builders.RelationExtractionTFGraphBuilder.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders_1x.graph_builders.TFGraphBuilder.html delete mode 100644 
docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders_1x.graph_builders.TFGraphBuilderFactory.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders_1x.graph_builders.WrongTFVersion.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders_1x.graph_builders.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.AnnotationMerger.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.AssertionDLApproach.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.AssertionDLModel.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.AssertionFilterer.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.AssertionLogRegApproach.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.AssertionLogRegModel.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.AverageEmbeddings.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.BertSentenceChunkEmbeddings.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.Chunk2Token.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.ChunkConverter.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.ChunkFilterer.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.ChunkFiltererApproach.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.ChunkKeyPhraseExtraction.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.ChunkMergeApproach.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.ChunkMergeModel.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.ChunkSentenceSplitter.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.CommonResolverParams.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.ContextualParserApproach.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.ContextualParserModel.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.DateNormalizer.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.DeIdentification.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.DeIdentificationModel.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.DocumentLogRegClassifierApproach.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.DocumentLogRegClassifierModel.html delete mode 100644 
docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.DrugNormalizer.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.EntityChunkEmbeddings.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.GenericClassifierApproach.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.GenericClassifierModel.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.IOBTagger.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.MedicalBertForSequenceClassification.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.MedicalBertForTokenClassifier.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.MedicalDistilBertForSequenceClassification.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.MedicalNerApproach.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.MedicalNerModel.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.NerChunker.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.NerConverterInternal.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.NerDisambiguator.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.NerDisambiguatorModel.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.PosologyREModel.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.RENerChunksFilter.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.ReIdentification.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.RelationExtractionApproach.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.RelationExtractionDLModel.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.RelationExtractionModel.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.Router.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.SentenceEntityResolverApproach.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.SentenceEntityResolverModel.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.SentenceResolverParams.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.base.FeaturesAssembler.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.base.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.training.AnnotationToolJsonReader.html delete mode 
100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.training.CantemistReader.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.training.CodiEspReader.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.training.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.training.tf_graph.html delete mode 100644 docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.training.tf_graph_1x.html diff --git a/docs/Gemfile.lock b/docs/Gemfile.lock index c565b581b9..1cbf87b84f 100644 --- a/docs/Gemfile.lock +++ b/docs/Gemfile.lock @@ -7,23 +7,22 @@ PATH GEM remote: https://rubygems.org/ specs: - activesupport (6.0.6) + activesupport (7.0.7.2) concurrent-ruby (~> 1.0, >= 1.0.2) - i18n (>= 0.7, < 2) - minitest (~> 5.1) - tzinfo (~> 1.1) - zeitwerk (~> 2.2, >= 2.2.2) - addressable (2.8.1) + i18n (>= 1.6, < 2) + minitest (>= 5.1) + tzinfo (~> 2.0) + addressable (2.8.5) public_suffix (>= 2.0.2, < 6.0) coffee-script (2.4.1) coffee-script-source execjs coffee-script-source (1.11.1) colorator (1.1.0) - commonmarker (0.23.6) - concurrent-ruby (1.1.10) - dnsruby (1.61.9) - simpleidn (~> 0.1) + commonmarker (0.23.10) + concurrent-ruby (1.2.2) + dnsruby (1.70.0) + simpleidn (~> 0.2.1) elasticsearch (7.17.7) elasticsearch-api (= 7.17.7) elasticsearch-transport (= 7.17.7) @@ -35,12 +34,12 @@ GEM em-websocket (0.5.3) eventmachine (>= 0.12.9) http_parser.rb (~> 0) - ethon (0.15.0) + ethon (0.16.0) ffi (>= 1.15.0) eventmachine (1.2.7) eventmachine (1.2.7-x64-mingw32) execjs (2.8.1) - faraday (1.10.2) + faraday (1.10.3) faraday-em_http (~> 1.0) faraday-em_synchrony (~> 1.0) faraday-excon (~> 1.1) @@ -64,16 +63,14 @@ GEM faraday-rack (1.0.0) faraday-retry (1.0.3) ffi (1.15.5) - ffi (1.15.5-x64-mingw-ucrt) - ffi (1.15.5-x64-mingw32) forwardable-extended (2.6.0) gemoji (3.0.1) - github-pages (227) + github-pages (228) github-pages-health-check (= 1.17.9) - jekyll (= 3.9.2) + jekyll (= 3.9.3) jekyll-avatar (= 0.7.0) jekyll-coffeescript (= 1.1.1) - jekyll-commonmark-ghpages (= 0.2.0) + jekyll-commonmark-ghpages (= 0.4.0) jekyll-default-layout (= 0.1.4) jekyll-feed (= 0.15.1) jekyll-gist (= 1.5.0) @@ -107,7 +104,7 @@ GEM jemoji (= 0.12.0) kramdown (= 2.3.2) kramdown-parser-gfm (= 1.1.0) - liquid (= 4.0.3) + liquid (= 4.0.4) mercenary (~> 0.3) minima (= 2.5.1) nokogiri (>= 1.13.6, < 2.0) @@ -123,13 +120,13 @@ GEM activesupport (>= 2) nokogiri (>= 1.4) http_parser.rb (0.8.0) - i18n (0.9.5) + i18n (1.14.1) concurrent-ruby (~> 1.0) - jekyll (3.9.2) + jekyll (3.9.3) addressable (~> 2.4) colorator (~> 1.0) em-websocket (~> 0.5) - i18n (~> 0.7) + i18n (>= 0.7, < 2) jekyll-sass-converter (~> 1.0) jekyll-watch (~> 2.0) kramdown (>= 1.17, < 3) @@ -145,11 +142,11 @@ GEM coffee-script-source (~> 1.11.1) jekyll-commonmark (1.4.0) commonmarker (~> 0.22) - jekyll-commonmark-ghpages (0.2.0) - commonmarker (~> 0.23.4) + jekyll-commonmark-ghpages (0.4.0) + commonmarker (~> 0.23.7) jekyll (~> 3.9.0) jekyll-commonmark (~> 1.4.0) - rouge (>= 2.0, < 4.0) + rouge (>= 2.0, < 5.0) jekyll-default-layout (0.1.4) jekyll (~> 3.0) jekyll-feed (0.15.1) @@ -237,21 +234,21 @@ GEM rexml kramdown-parser-gfm (1.1.0) kramdown (~> 2.0) - liquid (4.0.3) - listen (3.7.1) + liquid (4.0.4) + listen (3.8.0) rb-fsevent (~> 0.10, >= 0.10.3) rb-inotify (~> 0.9, >= 0.9.10) mercenary (0.3.6) - mini_portile2 (2.8.0) + mini_portile2 (2.8.4) minima (2.5.1) jekyll (>= 3.5, < 5.0) jekyll-feed (~> 0.9) 
jekyll-seo-tag (~> 2.1) - minitest (5.16.3) + minitest (5.19.0) multi_json (1.15.0) - multipart-post (2.2.3) - nokogiri (1.13.9) - mini_portile2 (~> 2.8.0) + multipart-post (2.3.0) + nokogiri (1.15.4) + mini_portile2 (~> 2.8.2) racc (~> 1.4) octokit (4.25.1) faraday (>= 1, < 3) @@ -259,11 +256,11 @@ GEM pathutil (0.16.2) forwardable-extended (~> 2.6) public_suffix (4.0.7) - racc (1.6.0) + racc (1.7.1) rb-fsevent (0.11.2) rb-inotify (0.10.1) ffi (~> 1.0) - rexml (3.2.5) + rexml (3.2.6) rouge (3.26.0) ruby2_keywords (0.0.5) rubyzip (2.3.2) @@ -280,20 +277,19 @@ GEM unf (~> 0.1.4) terminal-table (1.8.0) unicode-display_width (~> 1.1, >= 1.1.1) - thread_safe (0.3.6) typhoeus (1.4.0) ethon (>= 0.9.0) - tzinfo (1.2.10) - thread_safe (~> 0.1) + tzinfo (2.0.6) + concurrent-ruby (~> 1.0) unf (0.1.4) unf_ext unf_ext (0.0.8.2) unicode-display_width (1.8.0) wdm (0.1.1) - webrick (1.7.0) - zeitwerk (2.6.1) + webrick (1.8.1) PLATFORMS + arm64-darwin-22 x64-mingw-ucrt x64-mingw32 x86_64-darwin-21 @@ -302,7 +298,7 @@ PLATFORMS DEPENDENCIES elasticsearch (~> 7.10) - github-pages (= 227) + github-pages (= 228) jekyll (~> 3.9) jekyll-incremental (= 0.1.0)! jekyll-redirect-from diff --git a/docs/en/ocr_pipeline_components.md b/docs/en/ocr_pipeline_components.md index 913c754df5..6ee254374c 100644 --- a/docs/en/ocr_pipeline_components.md +++ b/docs/en/ocr_pipeline_components.md @@ -1356,6 +1356,89 @@ data.select("dicom").show()
+### DicomDrawRegions
+
+`DicomDrawRegions` draws regions on a DICOM image, typically to mask the coordinates detected by `PositionFinder` in a de-identification pipeline.
+
+
+##### Input Columns
+
+{:.table-model-big}
+| Param name | Type | Default | Column Data Description |
+| --- | --- | --- | --- |
+| inputCol | string | content | Binary DICOM object |
+| inputRegionsCol | string | regions | Array[Coordinates] detected by `PositionFinder` |
+
+
+#### Parameters
+
+{:.table-model-big}
+| Param name | Type | Default | Description |
+| --- | --- | --- | --- |
+| scaleFactor | float | 1.0 | Scaling factor for the regions |
+| rotated | boolean | False | Enable/Disable support for rotated rectangles |
+| keepInput | boolean | False | Keep the original input column |
+| compression | string | RLELossless | Compression type |
+| forceCompress | boolean | False | If True, always compress the output image; if False, compress only when the original image was compressed |
+| aggCols | Array[string] | ['path'] | Columns to include in aggregation; these columns are preserved in the output DataFrame after the transformation |
+
+
+##### Output Columns
+
+{:.table-model-big}
+| Param name | Type | Default | Column Data Description |
+| --- | --- | --- | --- |
+| outputCol | string | image | Modified DICOM file data |
+
+**Example:**
+
+
+{% include programmingLanguageSelectScalaPython.html %}
+
+```python
+from sparkocr.transformers import *
+
+dicomPath = "path to dicom files"
+
+# Read DICOM files as binary files
+df = spark.read.format("binaryFile").load(dicomPath)
+
+dicomToImage = DicomToImage() \
+    .setInputCol("content") \
+    .setOutputCol("image") \
+    .setMetadataCol("meta")
+
+# The "ner_chunk" column is usually produced by a de-identification NLP pipeline (not shown here)
+position_finder = PositionFinder() \
+    .setInputCols("ner_chunk") \
+    .setOutputCol("coordinates") \
+    .setPageMatrixCol("positions") \
+    .setPadding(0)
+
+draw_regions = DicomDrawRegions() \
+    .setInputCol("content") \
+    .setInputRegionsCol("coordinates") \
+    .setOutputCol("dicom") \
+    .setKeepInput(True) \
+    .setScaleFactor(1/3.0) \
+    .setAggCols(["path", "content"])
+
+data = dicomToImage.transform(df)
+data = position_finder.transform(data)
+data = draw_regions.transform(data)
+
+data.select("content", "dicom").show()
+```
+
+```scala
+// Note: DicomDrawRegions is not available in the Scala API.
+// Use the Python API for DICOM image manipulation and transformation.
+```
+
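+
+The compression-related parameters listed above can be configured in the same way. Below is a minimal sketch, assuming the standard Spark-style setter names for the `compression` and `forceCompress` params (these setters are an assumption; verify the exact names against the Python API reference of your Spark OCR version):
+
+```python
+from sparkocr.transformers import *
+
+# Hypothetical illustration: setter names are assumed from the usual Spark param
+# naming convention and are not part of the example above.
+draw_regions_compressed = DicomDrawRegions() \
+    .setInputCol("content") \
+    .setInputRegionsCol("coordinates") \
+    .setOutputCol("dicom") \
+    .setCompression("RLELossless") \
+    .setForceCompress(True)
+```
+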
+ ## Image pre-processing Next section describes the transformers for image pre-processing: scaling, binarization, skew correction, etc. @@ -2896,6 +2979,11 @@ val result = modelPipeline.transform(df) | lineWidth | Int | 4 | Line width for draw rectangles | | fontSize | Int | 12 | Font size for render labels and score | | rotated | boolean | False | Support rotated regions | +| rectColor | Color | Color.black | Color outline for bounding box | +| filledRect | boolean | False | Enable/Disable filling rectangle | +| sourceImageHeightCol | Int | height_dimension | Original annotation reference height | +| sourceImageWidthCol | Int | width_dimension | Original annotation reference width | +| scaleBoundingBoxes | Boolean | True | sourceImage height & width are required for scaling. Necessary to ensure accurate regions despite image transformations.|
@@ -2915,13 +3003,12 @@ val result = modelPipeline.transform(df) ```python from pyspark.ml import PipelineModel from sparkocr.transformers import * +from sparkocr.enums import * imagePath = "path to image" # Read image file as binary file -df = spark.read - .format("binaryFile") - .load(imagePath) +df = spark.read.format("binaryFile").load(imagePath) binary_to_image = BinaryToImage() \ .setInputCol("content") \ @@ -2935,6 +3022,7 @@ layout_analyzer = ImageLayoutAnalyzer() \ draw = ImageDrawRegions() \ .setInputCol("image") \ .setRegionCol("regions") \ + .setRectColor(Color.red) \ .setOutputCol("image_with_regions") # Define pipeline @@ -2950,17 +3038,16 @@ data.show() ```scala import org.apache.spark.ml.Pipeline +import java.awt.Color import com.johnsnowlabs.ocr.transformers.{ImageSplitRegions, ImageLayoutAnalyzer} import com.johnsnowlabs.ocr.OcrContext.implicits._ + val imagePath = "path to image" // Read image file as binary file -val df = spark.read - .format("binaryFile") - .load(imagePath) - .asImage("image") +val df = spark.read.format("binaryFile").load(imagePath).asImage("image") // Define transformer for detect regions val layoutAnalyzer = new ImageLayoutAnalyzer() @@ -2970,6 +3057,7 @@ val layoutAnalyzer = new ImageLayoutAnalyzer() val draw = new ImageDrawRegions() .setInputCol("image") .setRegionCol("regions") + .setRectColor(Color.RED) .setOutputCol("image_with_regions") // Define pipeline diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders.graph_builders.AssertionTFGraphBuilder.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders.graph_builders.AssertionTFGraphBuilder.html deleted file mode 100644 index 0b352b8a4a..0000000000 --- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders.graph_builders.AssertionTFGraphBuilder.html +++ /dev/null @@ -1,650 +0,0 @@ - - - - - - - - sparknlp_jsl._tf_graph_builders.graph_builders.AssertionTFGraphBuilder — Spark NLP 3.3.0 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl._tf_graph_builders.graph_builders.AssertionTFGraphBuilder

-
-
-class sparknlp_jsl._tf_graph_builders.graph_builders.AssertionTFGraphBuilder(build_params)[source]
-

Bases: sparknlp_jsl._tf_graph_builders.graph_builders.TFGraphBuilder

-

Class to build the the TF graphs for AssertionDLApproach

-

Examples

-
>>> from sparknlp_jsl.training import tf_graph
->>> from sparknlp_jsl.base import *
->>> from sparknlp.annotator import *
->>> from sparknlp_jsl.annotator import *
->>> from sparknlp_jsl.annotator import *
->>>feat_size = 200
->>>n_classes = 6
->>> tf_graph.build("assertion_dl",build_params={"n_classes": n_classes}, model_location= "./tf_graphs", model_filename="blstm_34_32_30_{}_{}.pb".format(feat_size, n_classes))
->>> assertion = AssertionDLApproach() \
->>>               .setLabelCol("label") \
->>>               .setInputCols("document", "chunk", "embeddings") \
->>>               .setOutputCol("assertion") \
->>>               .setBatchSize(128) \
->>>               .setDropout(0.1) \
->>>               .setLearningRate(0.001) \
->>>               .setEpochs(50) \
->>>               .setValidationSplit(0.2) \
->>>               .setStartCol("start") \
->>>               .setEndCol("end") \
->>>               .setMaxSentLen(250) \
->>>               .setEnableOutputLogs(True) \
->>>               .setOutputLogsPath('training_logs/') \
->>>               .setGraphFolder('tf_graphs')
-
-
-

Methods

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

__init__(build_params)

build(model_location, model_filename)

check_build_params()

get_build_param(build_param)

get_build_params()

get_build_params_with_defaults()

get_model_build_param_explanations()

get_model_build_params()

get_model_filename()

supports_auto_file_name()

-
- -
- - -
- - - - -
- - -
-
- - - -
-
- - - - - -
-
- - \ No newline at end of file diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders.graph_builders.GenericClassifierTFGraphBuilder.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders.graph_builders.GenericClassifierTFGraphBuilder.html deleted file mode 100644 index b3d99d3487..0000000000 --- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders.graph_builders.GenericClassifierTFGraphBuilder.html +++ /dev/null @@ -1,650 +0,0 @@ - - - - - - - - sparknlp_jsl._tf_graph_builders.graph_builders.GenericClassifierTFGraphBuilder — Spark NLP 3.3.0 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl._tf_graph_builders.graph_builders.GenericClassifierTFGraphBuilder

-
-
-class sparknlp_jsl._tf_graph_builders.graph_builders.GenericClassifierTFGraphBuilder(build_params)[source]
-

Bases: sparknlp_jsl._tf_graph_builders.graph_builders.TFGraphBuilder

-

Class to create the the TF graphs for GenericClassifierApproach

-

Examples

-
>>> from sparknlp_jsl.training import tf_graph
->>> from sparknlp_jsl.base import *
->>> from sparknlp.annotator import *
->>> from sparknlp_jsl.annotator import *
->>> from sparknlp_jsl.annotator import *
->>> dataframe = pd.read_csv('petfinder-mini.csv')
->>> DL_params = {"input_dim": 302,"output_dim": 2,"hidden_layers": [300, 200, 100], "hidden_act": "tanh",'hidden_act_l2':1,'batch_norm':1}
->>> tf_graph.build("generic_classifier",build_params=DL_params, model_location="/content/gc_graph", model_filename="auto")
->>> gen_clf = GenericClassifierApproach() \
-...    .setLabelColumn("target") \
-...    .setInputCols(["features"]) \
-...    .setOutputCol("prediction") \
-...    .setModelFile('/content/gc_graph/gcl.302.2.pb') \
-...    .setEpochsNumber(50) \
-...    .setBatchSize(100) \
-...    .setFeatureScaling("zscore") \
-...    .setFixImbalance(True) \
-...    .setLearningRate(0.001) \
-...    .setOutputLogsPath("logs") \
-...    .setValidationSplit(0.2)
-
-
-
>>> clf_Pipeline = Pipeline(stages=[features_asm,gen_clf])
-
-
-

Methods

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

__init__(build_params)

build(model_location, model_filename)

check_build_params()

get_build_param(build_param)

get_build_params()

get_build_params_with_defaults()

get_model_build_param_explanations()

get_model_build_params()

get_model_filename()

supports_auto_file_name()

-
- -
- - -
- - - - -
- - -
-
- - - -
-
- - - - - -
-
- - \ No newline at end of file diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders.graph_builders.NerTFGraphBuilder.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders.graph_builders.NerTFGraphBuilder.html deleted file mode 100644 index 047ae3fc1a..0000000000 --- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders.graph_builders.NerTFGraphBuilder.html +++ /dev/null @@ -1,636 +0,0 @@ - - - - - - - - sparknlp_jsl._tf_graph_builders.graph_builders.NerTFGraphBuilder — Spark NLP 3.3.0 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl._tf_graph_builders.graph_builders.NerTFGraphBuilder

-
-
-class sparknlp_jsl._tf_graph_builders.graph_builders.NerTFGraphBuilder(build_params)[source]
-

Bases: sparknlp_jsl._tf_graph_builders.graph_builders.TFGraphBuilder

-

Class to build the the TF graphs for MedicalNerApproach.

-

Examples

-
>>> from sparknlp_jsl.training import tf_graph
->>> from sparknlp_jsl.base import *
->>> from sparknlp.annotator import *
->>> from sparknlp_jsl.annotator import *
->>> from sparknlp_jsl.annotator import *
->>>feat_size = 200
->>>n_classes = 6
->>> tf_graph.build("ner_dl", build_params={"embeddings_dim": 200, "nchars": 83,"ntags": 12,"is_medical": 1},model_location="./medical_ner_graphs",model_filename="auto")
->>> nerTagger = MedicalNerApproach()    >>>                     .setInputCols(["sentence", "token", "embeddings"])    >>>                     .setLabelColumn("label")    >>>                     .setOutputCol("ner")    >>>                     .setMaxEpochs(2)    >>>                     .setBatchSize(64)    >>>                     .setRandomSeed(0)    >>>                     .setVerbose(1)    >>>                     .setValidationSplit(0.2)    >>>                     .setEvaluationLogExtended(True)     >>>                     .setEnableOutputLogs(True)    >>>                     .setIncludeConfidence(True)    >>>                     .setOutputLogsPath('ner_logs')    >>>                     .setGraphFolder('medical_ner_graphs')    >>>                     .setEnableMemoryOptimizer(True)
-
-
-

Methods

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

__init__(build_params)

build(model_location, model_filename)

check_build_params()

get_build_param(build_param)

get_build_params()

get_build_params_with_defaults()

get_model_build_param_explanations()

get_model_build_params()

get_model_filename()

supports_auto_file_name()

-
- -
- - -
- - - - -
- - -
-
- - - -
-
- - - - - -
-
- - \ No newline at end of file diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders.graph_builders.RelationExtractionTFGraphBuilder.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders.graph_builders.RelationExtractionTFGraphBuilder.html deleted file mode 100644 index 3525e029d6..0000000000 --- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders.graph_builders.RelationExtractionTFGraphBuilder.html +++ /dev/null @@ -1,645 +0,0 @@ - - - - - - - - sparknlp_jsl._tf_graph_builders.graph_builders.RelationExtractionTFGraphBuilder — Spark NLP 3.3.0 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl._tf_graph_builders.graph_builders.RelationExtractionTFGraphBuilder

-
-
-class sparknlp_jsl._tf_graph_builders.graph_builders.RelationExtractionTFGraphBuilder(build_params)[source]
-

Bases: sparknlp_jsl._tf_graph_builders.graph_builders.GenericClassifierTFGraphBuilder

-

Class to build the the TF graphs for RelationExtractionApproach

-

Examples

-
>>> from sparknlp_jsl.training import tf_graph
->>> from sparknlp_jsl.base import *
->>> from sparknlp.annotator import *
->>> from sparknlp_jsl.annotator import *
->>> from sparknlp_jsl.annotator import *
->>> tf_graph.build("relation_extraction", build_params={"input_dim": 6000, "output_dim": 3, 'batch_norm':1, "hidden_layers": [300, 200], "hidden_act": "relu", 'hidden_act_l2':1}, model_location=".", model_filename="re_with_BN")
->>> re_approach = RelationExtractionApproach() \
-...    .setLabelColumn("rel") \
-...    .setInputCols(["embeddings", "pos_tags", "train_ner_chunks", "dependencies"]) \
-...    .setOutputCol("relations") \
-...    .setModelFile('./re_with_BN') \
-...    .setEpochsNumber(70) \
-...    .setBatchSize(200) \
-...    .setFixImbalance(True) \
-...    .setLearningRate(0.001) \
-...    .setFromEntity("begin1i", "end1i", "label1") \
-...    .setToEntity("begin2i", "end2i", "label2") \
-...    .setValidationSplit(0.2)
-
-
-

Methods

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

__init__(build_params)

build(model_location, model_filename)

check_build_params()

get_build_param(build_param)

get_build_params()

get_build_params_with_defaults()

get_model_build_param_explanations()

get_model_build_params()

get_model_filename()

supports_auto_file_name()

-
- -
- - -
- - - - -
- - -
-
- - - -
-
- - - - - -
-
- - \ No newline at end of file diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders.graph_builders.TFGraphBuilder.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders.graph_builders.TFGraphBuilder.html deleted file mode 100644 index 5b81109db4..0000000000 --- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders.graph_builders.TFGraphBuilder.html +++ /dev/null @@ -1,626 +0,0 @@ - - - - - - - - sparknlp_jsl._tf_graph_builders.graph_builders.TFGraphBuilder — Spark NLP 3.3.0 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl._tf_graph_builders.graph_builders.TFGraphBuilder

-
-
-class sparknlp_jsl._tf_graph_builders.graph_builders.TFGraphBuilder(build_params)[source]
-

Bases: object

-

Generic class to create the tensorflow graphs for ‘ner_dl’, ‘generic_classifier’, ‘assertion_dl’, ‘relation_extraction’ annotators in spark-nlp healthcare. In version 1.1 -Examples -——– ->>> from sparknlp_jsl.training import tf_graph ->>> ->>> tf_graph.get_models()

-

Methods

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

__init__(build_params)

check_build_params()

get_build_param(build_param)

get_build_params()

get_build_params_with_defaults()

get_model_build_param_explanations()

get_model_build_params()

get_model_filename()

supports_auto_file_name()

-
- -
- - -
- - - - -
- - -
-
- - - -
-
- - - - - -
-
- - \ No newline at end of file diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders.graph_builders.TFGraphBuilderFactory.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders.graph_builders.TFGraphBuilderFactory.html deleted file mode 100644 index 9d54984723..0000000000 --- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders.graph_builders.TFGraphBuilderFactory.html +++ /dev/null @@ -1,660 +0,0 @@ - - - - - - - - sparknlp_jsl._tf_graph_builders.graph_builders.TFGraphBuilderFactory — Spark NLP 3.3.0 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl._tf_graph_builders.graph_builders.TFGraphBuilderFactory

-
-
-class sparknlp_jsl._tf_graph_builders.graph_builders.TFGraphBuilderFactory[source]
-

Bases: object

-

Factory class to create the the different tensorflow graphs for ner_dl, generic_classifier, assertion_dl, relation_extraction annotators in spark-nlp healthcare

-

Methods

- ---- - - - - - - - - - - - - - - -

__init__(*args, **kwargs)

build(model_name, build_params, model_location)

Method that create the tf graph.

get_models()

Method that return the available tf models in spark-nlp healthcare Examples -------- >>> from sparknlp_jsl.training import tf_graph >>> tf_graph.get_models()

print_model_params(model_name)

Method that return the params allowed for the tf model.This method return the params with the description for every param.

-
-
-static build(model_name, build_params, model_location, model_filename='auto')[source]
-

Method that create the tf graph.

-
-
Parameters
-
-
model_name: str

The name of the tf model that you want to build.Model availables ner_dl,generic_classifier,assertion_dl and relation_extraction

-
-
build_params: dict

Configuration params to build the tf graph for the specific model.

-
-
model_location: str

Path where the model will be saved

-
-
model_filename: str

Name of the .rb file. If you put auto the filename will be generated.

-
-
-
-
-

Examples

-
>>> from sparknlp_jsl.training import tf_graph
->>> tf_graph.build("assertion_dl",build_params={"n_classes": 10}, model_location="/tmp", model_filename="assertion_dl.pb")
-
-
-
- -
-
-static get_models()[source]
-

Method that return the available tf models in spark-nlp healthcare -Examples -——– ->>> from sparknlp_jsl.training import tf_graph ->>> tf_graph.get_models()

-
- -
-
-static print_model_params(model_name)[source]
-

Method that return the params allowed for the tf model.This method return the params with the description for every param.

-
-
Parameters
-
-
model_name: str

The name of the tf model name.Model availables ner_dl,generic_classifier,assertion_dl and relation_extraction

-
-
-
-
-

Examples

-
>>> from sparknlp_jsl.training import tf_graph
->>> tf_graph.print_model_params("assertion_dl")
-
-
-
- -
- -
- - -
- - - - -
- - -
-
- - - -
-
- - - - - -
-
- - \ No newline at end of file diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders.graph_builders.TensorflowAddonsNeeded.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders.graph_builders.TensorflowAddonsNeeded.html deleted file mode 100644 index 19214050f0..0000000000 --- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders.graph_builders.TensorflowAddonsNeeded.html +++ /dev/null @@ -1,583 +0,0 @@ - - - - - - - - sparknlp_jsl._tf_graph_builders.graph_builders.TensorflowAddonsNeeded — Spark NLP 3.3.0 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl._tf_graph_builders.graph_builders.TensorflowAddonsNeeded

-
-
-exception sparknlp_jsl._tf_graph_builders.graph_builders.TensorflowAddonsNeeded[source]
-
- -
- - -
- - - - -
- - -
-
- - - -
-
- - - - - -
-
- - \ No newline at end of file diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders.graph_builders.WrongTFVersion.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders.graph_builders.WrongTFVersion.html deleted file mode 100644 index 187dcd2d69..0000000000 --- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders.graph_builders.WrongTFVersion.html +++ /dev/null @@ -1,581 +0,0 @@ - - - - - - - - sparknlp_jsl._tf_graph_builders.graph_builders.WrongTFVersion — Spark NLP 3.3.0 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl._tf_graph_builders.graph_builders.WrongTFVersion

-
-
-exception sparknlp_jsl._tf_graph_builders.graph_builders.WrongTFVersion[source]
-
- -
- - -
- - - - -
- - -
-
- - - -
-
- - - - - -
-
- - \ No newline at end of file diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders.graph_builders.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders.graph_builders.html deleted file mode 100644 index 2af68ed060..0000000000 --- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders.graph_builders.html +++ /dev/null @@ -1,620 +0,0 @@ - - - - - - - - sparknlp_jsl._tf_graph_builders.graph_builders — Spark NLP 3.3.0 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl._tf_graph_builders.graph_builders

-

Classes

- ---- - - - - - - - - - - - - - - - - - - - - -

AssertionTFGraphBuilder

Class to build the the TF graphs for AssertionDLApproach

GenericClassifierTFGraphBuilder

Class to create the the TF graphs for GenericClassifierApproach

NerTFGraphBuilder

Class to build the the TF graphs for MedicalNerApproach.

RelationExtractionTFGraphBuilder

Class to build the the TF graphs for RelationExtractionApproach

TFGraphBuilder

Generic class to create the tensorflow graphs for 'ner_dl', 'generic_classifier', 'assertion_dl', 'relation_extraction' annotators in spark-nlp healthcare.

TFGraphBuilderFactory

Factory class to create the the different tensorflow graphs for ner_dl, generic_classifier, assertion_dl, relation_extraction annotators in spark-nlp healthcare

-

Exceptions

- ---- - - - - - - - - -

TensorflowAddonsNeeded

WrongTFVersion

-
- - -
- - - - -
- - -
-
- - - -
-
- - - - - -
-
- - \ No newline at end of file diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders_1x.graph_builders.AssertionTFGraphBuilder.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders_1x.graph_builders.AssertionTFGraphBuilder.html deleted file mode 100644 index a645f6dc77..0000000000 --- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders_1x.graph_builders.AssertionTFGraphBuilder.html +++ /dev/null @@ -1,259 +0,0 @@ - - - - - - - - sparknlp_jsl._tf_graph_builders_1x.graph_builders.AssertionTFGraphBuilder — Spark NLP 3.3.0 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl._tf_graph_builders_1x.graph_builders.AssertionTFGraphBuilder

-
-
-class sparknlp_jsl._tf_graph_builders_1x.graph_builders.AssertionTFGraphBuilder(build_params)[source]
-

Bases: sparknlp_jsl._tf_graph_builders_1x.graph_builders.TFGraphBuilder

-

Class to build the the TF graphs for AssertionDLApproach

-

Examples

-
>>> from sparknlp_jsl.training import tf_graph_1x
->>> from sparknlp_jsl.base import *
->>> from sparknlp.annotator import *
->>> from sparknlp_jsl.annotator import *
->>> from sparknlp_jsl.annotator import *
->>>feat_size = 200
->>>n_classes = 6
->>> tf_graph_1x.build("assertion_dl",build_params={"n_classes": n_classes}, model_location= "./tf_graphs", model_filename="blstm_34_32_30_{}_{}.pb".format(feat_size, n_classes))
->>> assertion = AssertionDLApproach() \
->>>               .setLabelCol("label") \
->>>               .setInputCols("document", "chunk", "embeddings") \
->>>               .setOutputCol("assertion") \
->>>               .setBatchSize(128) \
->>>               .setDropout(0.1) \
->>>               .setLearningRate(0.001) \
->>>               .setEpochs(50) \
->>>               .setValidationSplit(0.2) \
->>>               .setStartCol("start") \
->>>               .setEndCol("end") \
->>>               .setMaxSentLen(250) \
->>>               .setEnableOutputLogs(True) \
->>>               .setOutputLogsPath('training_logs/') \
->>>               .setGraphFolder('tf_graphs')
-
-
-

Methods

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

__init__(build_params)

build(model_location, model_filename)

check_build_params()

get_build_param(build_param)

get_build_params()

get_build_params_with_defaults()

get_model_build_param_explanations()

get_model_build_params()

get_model_filename()

supports_auto_file_name()

-
- -
- - -
- - -
- - -
- -
- - -
-
- - - -
-
- - - - - -
-
- - \ No newline at end of file diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders_1x.graph_builders.GenericClassifierTFGraphBuilder.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders_1x.graph_builders.GenericClassifierTFGraphBuilder.html deleted file mode 100644 index d0cf258ff6..0000000000 --- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders_1x.graph_builders.GenericClassifierTFGraphBuilder.html +++ /dev/null @@ -1,257 +0,0 @@ - - - - - - - - sparknlp_jsl._tf_graph_builders_1x.graph_builders.GenericClassifierTFGraphBuilder — Spark NLP 3.3.0 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl._tf_graph_builders_1x.graph_builders.GenericClassifierTFGraphBuilder

-
-
-class sparknlp_jsl._tf_graph_builders_1x.graph_builders.GenericClassifierTFGraphBuilder(build_params)[source]
-

Bases: sparknlp_jsl._tf_graph_builders_1x.graph_builders.TFGraphBuilder

-

Class to create the the TF graphs for GenericClassifierApproach

-

Examples

-
>>> from sparknlp_jsl.training import tf_graph
->>> from sparknlp_jsl.base import *
->>> from sparknlp.annotator import *
->>> from sparknlp_jsl.annotator import *
->>> from sparknlp_jsl.annotator import *
->>> dataframe = pd.read_csv('petfinder-mini.csv')
->>> DL_params = {"input_dim": 302,"output_dim": 2,"hidden_layers": [300, 200, 100], "hidden_act": "tanh",'hidden_act_l2':1,'batch_norm':1}
->>> tf_graph.build("generic_classifier",build_params=DL_params, model_location="/content/gc_graph", model_filename="auto")
->>> gen_clf = GenericClassifierApproach() \
-...    .setLabelColumn("target") \
-...    .setInputCols(["features"]) \
-...    .setOutputCol("prediction") \
-...    .setModelFile('/content/gc_graph/gcl.302.2.pb') \
-...    .setEpochsNumber(50) \
-...    .setBatchSize(100) \
-...    .setFeatureScaling("zscore") \
-...    .setFixImbalance(True) \
-...    .setLearningRate(0.001) \
-...    .setOutputLogsPath("logs") \
-...    .setValidationSplit(0.2)
->>> clf_Pipeline = Pipeline(stages=[features_asm,gen_clf])
-
-
-

Methods

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

__init__(build_params)

build(model_location, model_filename)

check_build_params()

get_build_param(build_param)

get_build_params()

get_build_params_with_defaults()

get_model_build_param_explanations()

get_model_build_params()

get_model_filename()

supports_auto_file_name()

-
- -
- - -
- - -
- - -
- -
- - -
-
- - - -
-
- - - - - -
-
- - \ No newline at end of file diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders_1x.graph_builders.NerTFGraphBuilder.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders_1x.graph_builders.NerTFGraphBuilder.html deleted file mode 100644 index 0bab574412..0000000000 --- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders_1x.graph_builders.NerTFGraphBuilder.html +++ /dev/null @@ -1,245 +0,0 @@ - - - - - - - - sparknlp_jsl._tf_graph_builders_1x.graph_builders.NerTFGraphBuilder — Spark NLP 3.3.0 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl._tf_graph_builders_1x.graph_builders.NerTFGraphBuilder

-
-
-class sparknlp_jsl._tf_graph_builders_1x.graph_builders.NerTFGraphBuilder(build_params)[source]
-

Bases: sparknlp_jsl._tf_graph_builders_1x.graph_builders.TFGraphBuilder

-

Class to build the the TF graphs for MedicalNerApproach.

-

Examples

-
>>> from sparknlp_jsl.training import tf_graph_1x
->>> from sparknlp_jsl.base import *
->>> from sparknlp.annotator import *
->>> from sparknlp_jsl.annotator import *
->>> from sparknlp_jsl.annotator import *
->>>feat_size = 200
->>>n_classes = 6
->>> tf_graph_1x.build("ner_dl", build_params={"embeddings_dim": 200, "nchars": 83,"ntags": 12,"is_medical": 1},model_location="./medical_ner_graphs",model_filename="auto")
->>> nerTagger = MedicalNerApproach()    >>>                     .setInputCols(["sentence", "token", "embeddings"])    >>>                     .setLabelColumn("label")    >>>                     .setOutputCol("ner")    >>>                     .setMaxEpochs(2)    >>>                     .setBatchSize(64)    >>>                     .setRandomSeed(0)    >>>                     .setVerbose(1)    >>>                     .setValidationSplit(0.2)    >>>                     .setEvaluationLogExtended(True)     >>>                     .setEnableOutputLogs(True)    >>>                     .setIncludeConfidence(True)    >>>                     .setOutputLogsPath('ner_logs')    >>>                     .setGraphFolder('medical_ner_graphs')    >>>                     .setEnableMemoryOptimizer(True)
-
-
-

Methods

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

__init__(build_params)

build(model_location, model_filename)

check_build_params()

get_build_param(build_param)

get_build_params()

get_build_params_with_defaults()

get_model_build_param_explanations()

get_model_build_params()

get_model_filename()

supports_auto_file_name()

-
- -
- - -
- - -
- - -
- -
- - -
-
- - - -
-
- - - - - -
-
- - \ No newline at end of file diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders_1x.graph_builders.RelationExtractionTFGraphBuilder.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders_1x.graph_builders.RelationExtractionTFGraphBuilder.html deleted file mode 100644 index 4d5cc11ec9..0000000000 --- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders_1x.graph_builders.RelationExtractionTFGraphBuilder.html +++ /dev/null @@ -1,254 +0,0 @@ - - - - - - - - sparknlp_jsl._tf_graph_builders_1x.graph_builders.RelationExtractionTFGraphBuilder — Spark NLP 3.3.0 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl._tf_graph_builders_1x.graph_builders.RelationExtractionTFGraphBuilder

-
-
-class sparknlp_jsl._tf_graph_builders_1x.graph_builders.RelationExtractionTFGraphBuilder(build_params)[source]
-

Bases: sparknlp_jsl._tf_graph_builders_1x.graph_builders.GenericClassifierTFGraphBuilder

-

Class to build the the TF graphs for RelationExtractionApproach

-

Examples

-
>>> from sparknlp_jsl.training import tf_graph_1x
->>> from sparknlp_jsl.base import *
->>> from sparknlp.annotator import *
->>> from sparknlp_jsl.annotator import *
->>> from sparknlp_jsl.annotator import *
->>> tf_graph_1x.build("relation_extraction", build_params={"input_dim": 6000, "output_dim": 3, 'batch_norm':1, "hidden_layers": [300, 200], "hidden_act": "relu", 'hidden_act_l2':1}, model_location=".", model_filename="re_with_BN")
->>> re_approach = RelationExtractionApproach() \
-...    .setLabelColumn("rel") \
-...    .setInputCols(["embeddings", "pos_tags", "train_ner_chunks", "dependencies"]) \
-...    .setOutputCol("relations") \
-...    .setModelFile('./re_with_BN') \
-...    .setEpochsNumber(70) \
-...    .setBatchSize(200) \
-...    .setFixImbalance(True) \
-...    .setLearningRate(0.001) \
-...    .setFromEntity("begin1i", "end1i", "label1") \
-...    .setToEntity("begin2i", "end2i", "label2") \
-...    .setValidationSplit(0.2)
-
-
-

Methods

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

__init__(build_params)

build(model_location, model_filename)

check_build_params()

get_build_param(build_param)

get_build_params()

get_build_params_with_defaults()

get_model_build_param_explanations()

get_model_build_params()

get_model_filename()

supports_auto_file_name()

-
- -
- - -
- - -
- - -
- -
- - -
-
- - - -
-
- - - - - -
-
- - \ No newline at end of file diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders_1x.graph_builders.TFGraphBuilder.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders_1x.graph_builders.TFGraphBuilder.html deleted file mode 100644 index 1771232948..0000000000 --- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders_1x.graph_builders.TFGraphBuilder.html +++ /dev/null @@ -1,235 +0,0 @@ - - - - - - - - sparknlp_jsl._tf_graph_builders_1x.graph_builders.TFGraphBuilder — Spark NLP 3.3.0 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl._tf_graph_builders_1x.graph_builders.TFGraphBuilder

-
-
-class sparknlp_jsl._tf_graph_builders_1x.graph_builders.TFGraphBuilder(build_params)[source]
-

Bases: object

-

Generic class to create the tensorflow graphs for ‘ner_dl’, ‘generic_classifier’, ‘assertion_dl’, ‘relation_extraction’ annotators in spark-nlp healthcare

-

Examples

-
>>> from sparknlp_jsl.training import tf_graph_1x
->>> tf_graph_1x.get_models()
-
-
-

Methods

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

__init__(build_params)

check_build_params()

get_build_param(build_param)

get_build_params()

get_build_params_with_defaults()

get_model_build_param_explanations()

get_model_build_params()

get_model_filename()

supports_auto_file_name()

-
- -
- - -
- - -
- - -
- -
- - -
-
- - - -
-
- - - - - -
-
- - \ No newline at end of file diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders_1x.graph_builders.TFGraphBuilderFactory.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders_1x.graph_builders.TFGraphBuilderFactory.html deleted file mode 100644 index 5c6d0b1407..0000000000 --- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders_1x.graph_builders.TFGraphBuilderFactory.html +++ /dev/null @@ -1,270 +0,0 @@ - - - - - - - - sparknlp_jsl._tf_graph_builders_1x.graph_builders.TFGraphBuilderFactory — Spark NLP 3.3.0 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl._tf_graph_builders_1x.graph_builders.TFGraphBuilderFactory

-
-
-class sparknlp_jsl._tf_graph_builders_1x.graph_builders.TFGraphBuilderFactory[source]
-

Bases: object

-

Factory class to create the the different tensorflow graphs for ner_dl, generic_classifier, assertion_dl, relation_extraction annotators in spark-nlp healthcare

-

Methods

- ---- - - - - - - - - - - - - - - -

__init__(*args, **kwargs)

build(model_name, build_params, model_location)

Method that create the tf graph.

get_models()

Method that return the available tf models in spark-nlp healthcare

print_model_params(model_name)

Method that return the params allowed for the tf model.This method return the params with the description for every param.

-
-
-static build(model_name, build_params, model_location, model_filename='auto')[source]
-

Method that create the tf graph.

-
-
Parameters
-
-
model_name: str

The name of the TF model that you want to build. Available models: ner_dl, generic_classifier, assertion_dl, and relation_extraction.

-
-
build_params: dict

Configuration params to build the tf graph for the specific model.

-
-
model_location: str

Path where the model will be saved

-
-
model_filename: str

Name of the .pb file. If set to auto, the filename will be generated.

-
-
-
-
-

Examples

-
>>> from sparknlp_jsl.training import tf_graph
->>> tf_graph.build("assertion_dl",build_params={"n_classes": 10}, model_location="/tmp", model_filename="assertion_dl.pb")
-
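For reference, a fuller workflow sketch combining the factory methods documented on this page; the ner_dl build_params keys below are illustrative, the actual keys can be listed with print_model_params.
>>> from sparknlp_jsl.training import tf_graph
>>> tf_graph.get_models()
>>> tf_graph.print_model_params("ner_dl")
>>> # illustrative build_params; "auto" lets the factory generate the filename
>>> tf_graph.build("ner_dl",
...                build_params={"embeddings_dim": 200, "nchars": 83, "ntags": 12},
...                model_location="/tmp/ner_graphs",
...                model_filename="auto")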
-
-
- -
-
-static get_models()[source]
-

Method that returns the available TF models in Spark NLP for Healthcare.

-

Examples

-
>>> from sparknlp_jsl.training import tf_graph_1x
->>> tf_graph_1x.get_models()
-
-
-
- -
-
-static print_model_params(model_name)[source]
-

Method that returns the params allowed for the TF model, together with a description of every param.

-
-
Parameters
-
-
model_name: str

The name of the TF model. Available models: ner_dl, generic_classifier, assertion_dl, and relation_extraction.

-
-
-
-
-

Examples

-
>>> from sparknlp_jsl.training import tf_graph
->>> tf_graph.print_model_params("assertion_dl")
-
-
-
- -
- -
- - -
- - -
- - -
- -
- - -
-
- - - -
-
- - - - - -
-
- - \ No newline at end of file diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders_1x.graph_builders.WrongTFVersion.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders_1x.graph_builders.WrongTFVersion.html deleted file mode 100644 index df162a30e3..0000000000 --- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders_1x.graph_builders.WrongTFVersion.html +++ /dev/null @@ -1,192 +0,0 @@ - - - - - - - - sparknlp_jsl._tf_graph_builders_1x.graph_builders.WrongTFVersion — Spark NLP 3.3.0 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl._tf_graph_builders_1x.graph_builders.WrongTFVersion

-
-
-exception sparknlp_jsl._tf_graph_builders_1x.graph_builders.WrongTFVersion[source]
-
- -
- - -
- - -
- - -
- -
- - -
-
- - - -
-
- - - - - -
-
- - \ No newline at end of file diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders_1x.graph_builders.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders_1x.graph_builders.html deleted file mode 100644 index 4cdd75d0bd..0000000000 --- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl._tf_graph_builders_1x.graph_builders.html +++ /dev/null @@ -1,226 +0,0 @@ - - - - - - - - sparknlp_jsl._tf_graph_builders_1x.graph_builders — Spark NLP 3.3.0 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl._tf_graph_builders_1x.graph_builders

-

Classes

- ---- - - - - - - - - - - - - - - - - - - - - -

AssertionTFGraphBuilder

Class to build the TF graphs for AssertionDLApproach.

GenericClassifierTFGraphBuilder

Class to create the TF graphs for GenericClassifierApproach.

NerTFGraphBuilder

Class to build the TF graphs for MedicalNerApproach.

RelationExtractionTFGraphBuilder

Class to build the TF graphs for RelationExtractionApproach.

TFGraphBuilder

Generic class to create the TensorFlow graphs for the 'ner_dl', 'generic_classifier', 'assertion_dl', and 'relation_extraction' annotators in Spark NLP for Healthcare.

TFGraphBuilderFactory

Factory class to create the different TensorFlow graphs for the ner_dl, generic_classifier, assertion_dl, and relation_extraction annotators in Spark NLP for Healthcare.

-

Exceptions

- ---- - - - - - -

WrongTFVersion

-
- - -
- - -
- - -
- -
- - -
-
- - - -
-
- - - - - -
-
- - \ No newline at end of file diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.AnnotationMerger.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.AnnotationMerger.html deleted file mode 100644 index a637b3df7e..0000000000 --- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.AnnotationMerger.html +++ /dev/null @@ -1,1044 +0,0 @@ - - - - - - - - sparknlp_jsl.annotator.AnnotationMerger — Spark NLP 3.3.0 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl.annotator.AnnotationMerger

-
-
-class sparknlp_jsl.annotator.AnnotationMerger(classname='com.johnsnowlabs.nlp.AnnotationMerger', java_model=None)[source]
-

Bases: sparknlp.common.AnnotatorModel

-
-

Merges Annotations from multiple columns.

- ---- - - - - - - - - - - -

Input Annotation types

Output Annotation type

ANY

ANY

-
-
-
Parameters
-
-
inputType

The type of the annotations that you want to merge. Possible values: document|token|wordpiece|word_embeddings|sentence_embeddings|category|date|sentiment|pos|chunk|named_entity|regex|dependency|labeled_dependency|language|keyword

-
-
-
-
-

Examples

-
>>> docs = [[""]]
->>> test_data = spark.createDataFrame(docs).toDF("text")
->>> document1 = DocumentAssembler().setInputCol("text").setOutputCol("document1")
->>> document2 = DocumentAssembler().setInputCol("text").setOutputCol("document2")
->>> annotation_merger = AnnotationMerger()...     .setInputCols("document1", "document2")...     .setInputType("document")...     .setOutputCol("all_docs")
->>>
->>> pipeline = Pipeline().setStages([document1, document2, annotation_merger]).fit(docs)
->>> lp = LightPipeline(pipeline)
->>> lp.fullAnnotate("one doc to be replicated")
-[{'document1': [Annotation(document, 0, 23, one doc to be replicated, {})], 'document2': [Annotation(document, 0, 23, one doc to be replicated, {})], 'all_docs': [Annotation(document, 0, 23, one doc to be replicated, {}), Annotation(document, 0, 23, one doc to be replicated, {})]}]
-
-
-
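Another common use is merging chunk-type annotations coming from two upstream NER stages; a minimal sketch, with illustrative column names:
>>> # column names are placeholders for two NER chunk outputs
>>> chunk_merger = AnnotationMerger() \
...     .setInputCols(["ner_chunk_clinical", "ner_chunk_posology"]) \
...     .setInputType("chunk") \
...     .setOutputCol("merged_chunks")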

Methods

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

__init__([classname, java_model])

Initialize this instance with a Java model object.

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

getInputCols()

Gets current column names of input annotations.

getLazyAnnotator()

Gets whether Annotator should be evaluated lazily in a RecursivePipeline.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getOutputCol()

Gets output column name of annotations.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

set(param, value)

Sets a parameter in the embedded param map.

setInputCols(*value)

Sets column names of input annotations.

setInputType(value)

Sets the type of the annotations that you want to merge; by default sentence_embeddings.

setLazyAnnotator(value)

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

setOutputCol(value)

Sets output column name of annotations.

setParamValue(paramName)

Sets the value of a parameter.

setParams()

transform(dataset[, params])

Transforms the input dataset with optional parameters.

write()

Returns an MLWriter instance for this ML instance.

-

Attributes

- ---- - - - - - - - - - - - - - - - - - - - - - - - -

getter_attrs

inputCols

inputType

lazyAnnotator

name

outputCol

params

Returns all params ordered by name.

-
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some -extra params. This implementation first calls Params.copy and -then make a copy of the companion Java pipeline component with -extra params. So both the Python wrapper and the Java pipeline -component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
-
-getInputCols()
-

Gets current column names of input annotations.

-
- -
-
-getLazyAnnotator()
-

Gets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getOutputCol()
-

Gets output column name of annotations.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-property params
-

Returns all params ordered by name. The default implementation -uses dir() to get all attributes of type -Param.

-
- -
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setInputCols(*value)
-

Sets column names of input annotations.

-
-
Parameters
-
-
*valuestr

Input columns for the annotator

-
-
-
-
-
- -
-
-setInputType(value)[source]
-

Sets the type of the annotations that you want to merge; by default sentence_embeddings.

-
-
Parameters
-
-
valueint

The type of the annotations that you want to merge; by default sentence_embeddings.

-
-
-
-
-
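For example, to merge chunk-type annotations instead (a minimal sketch):
>>> annotation_merger = AnnotationMerger().setInputType("chunk")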
- -
-
-setLazyAnnotator(value)
-

Sets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
-
Parameters
-
-
valuebool

Whether Annotator should be evaluated lazily in a -RecursivePipeline

-
-
-
-
-
- -
-
-setOutputCol(value)
-

Sets output column name of annotations.

-
-
Parameters
-
-
valuestr

Name of output column

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-transform(dataset, params=None)
-

Transforms the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset

-
-
paramsdict, optional

an optional param map that overrides embedded params.

-
-
-
-
Returns
-
-
pyspark.sql.DataFrame

transformed dataset

-
-
-
-
-
- -
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

-
- -
- -
- - -
- - - - -
- - -
-
- - - -
-
- - - - - -
-
- - \ No newline at end of file diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.AssertionDLApproach.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.AssertionDLApproach.html deleted file mode 100644 index f613cd1449..0000000000 --- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.AssertionDLApproach.html +++ /dev/null @@ -1,1467 +0,0 @@ - - - - - - - - sparknlp_jsl.annotator.AssertionDLApproach — Spark NLP 3.3.0 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl.annotator.AssertionDLApproach

-
-
-class sparknlp_jsl.annotator.AssertionDLApproach[source]
-

Bases: sparknlp.common.AnnotatorApproach

-

Trains an assertion status model using deep learning from extracted entities and text. It requires DOCUMENT, CHUNK, and WORD_EMBEDDINGS type annotator inputs, which can be obtained, for example, from a DocumentAssembler, a Chunker, and a WordEmbeddingsModel.

-

The training data should have annotation columns of type DOCUMENT, CHUNK, and WORD_EMBEDDINGS, a label column (the assertion status that you want to predict), a start column (the start index of the term that has the assertion status), and an end column (the end index of the term that has the assertion status). This model uses deep learning to predict the assertion status.

- ---- - - - - - - - - - - -

Input Annotation types

Output Annotation type

DOCUMENT, CHUNK, WORD_EMBEDDINGS

ASSERTION

-
-
Parameters
-
-
label

Column with one label per document. Example of possible values: “present”, “absent”, “hypothetical”, “conditional”, “associated_with_other_person”, etc.

-
-
startCol

Column that contains the token number for the start of the target

-
-
endCol

Column that contains the token number for the end of the target.

-
-
batchSize

Size for each batch in the optimization process

-
-
epochs

Number of epochs for the optimization process

-
-
learningRate

Learning rate for the optimization process

-
-
dropout

Dropout at the output of each layer.

-
-
maxSentLen

Max length for an input sentence.

-
-
graphFolder

Folder path that contains external graph files.

-
-
configProtoBytes

ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()

-
-
validationSplit

The proportion of the training dataset to be validated against the model on each epoch. The value should be between 0.0 and 1.0; by default it is 0.0 (off).

-
-
evaluationLogExtended

Whether to produce extended evaluation logs.

-
-
testDataset

Path to the test dataset. If set, it is used to calculate statistics during training.

-
-
includeConfidence

whether to include confidence scores in annotation metadata

-
-
enableOutputLogs

whether or not to output logs

-
-
outputLogsPath

Folder path to save training logs

-
-
verbose

Level of verbosity during training

-
-
-
-
-

Examples

-
>>> import sparknlp
->>> from sparknlp.base import *
->>> from sparknlp.annotator import *
->>> from sparknlp_jsl.annotator import *
->>> from sparknlp.training import *
->>> from pyspark.ml import Pipeline
->>> document_assembler = DocumentAssembler() \
-...    .setInputCol("text") \
-...    .setOutputCol("document")
->>> sentence_detector = SentenceDetector() \
-...    .setInputCol("document") \
-...    .setOutputCol("sentence")
->>> tokenizer = Tokenizer() \
-...    .setInputCols(["sentence"]) \
-...    .setOutputCol("token")
->>> embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") \
-...    .setInputCols(["sentence", "token"]) \
-...    .setOutputCol("word_embeddings") \
-...    .setCaseSensitive(False)
->>> chunk = Chunker() \
-...    .setInputCols(["sentence"]) \
-...    .setChunkCol("chunk") \
-...    .setOutputCol("chunk")
->>> assertion = AssertionDLApproach() \
-...    .setLabelCol("label") \
-...    .setInputCols(["document", "chunk", "word_embeddings"]) \
-...    .setOutputCol("assertion") \
-...    .setBatchSize(128) \
-...    .setDropout(0.012) \
-...    .setLearningRate(0.015) \
-...    .setEpochs(1) \
-...    .setStartCol("start") \
-...    .setEndCol("end") \
-...    .setMaxSentLen(250)
->>> assertionPipeline = Pipeline(stages=[
-...    document_assembler,
-...    sentence_detector,
-...    tokenizer,
-...    embeddings,
-...    chunk,
-...    assertion])
->>> assertionModel = assertionPipeline.fit(dataset)
-
-
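A minimal sketch showing how the evaluation-related setters above can be combined on the same approach; the split value and paths are illustrative, not defaults:
>>> # illustrative values; adjust to your own data and filesystem
>>> assertion = assertion \
...     .setValidationSplit(0.2) \
...     .setTestDataset("assertion_test.parquet") \
...     .setEvaluationLogExtended(True) \
...     .setEnableOutputLogs(True) \
...     .setOutputLogsPath("training_logs")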
-

Methods

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

__init__()

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

fit(dataset[, params])

Fits a model to the input dataset with optional parameters.

fitMultiple(dataset, paramMaps)

Fits a model to the input dataset for each param map in paramMaps.

getInputCols()

Gets current column names of input annotations.

getLazyAnnotator()

Gets whether Annotator should be evaluated lazily in a RecursivePipeline.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getOutputCol()

Gets output column name of annotations.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

set(param, value)

Sets a parameter in the embedded param map.

setBatchSize(size)

Set Size for each batch in the optimization process.

setConfigProtoBytes(b)

Sets ConfigProto from tensorflow, serialized into byte array.

setDropout(rate)

Set a dropout at the output of each layer

setEnableOutputLogs(value)

Sets whether to output logs to the annotators log folder.

setEndCol(e)

Set column that contains the token number for the end of the target.

setEpochs(number)

Sets number of epochs for the optimization process

setEvaluationLogExtended(v)

Sets whether to produce extended evaluation logs.

setGraphFolder(p)

Sets folder path that contains external graph files.

setIncludeConfidence(value)

Sets whether to include confidence scores in annotation metadata.

setInputCols(*value)

Sets column names of input annotations.

setLabelCol(label)

Set a column with one label per document.

setLazyAnnotator(value)

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

setLearningRate(lamda)

Set a learning rate for the optimization process

setMaxSentLen(length)

Set the max length for an input sentence.

setOutputCol(value)

Sets output column name of annotations.

setOutputLogsPath(value)

Sets folder path to save training logs.

setParamValue(paramName)

Sets the value of a parameter.

setStartCol(s)

Set a column that contains the token number for the start of the target

setTestDataset(path[, read_as, options])

Sets path to test dataset.

setValidationSplit(v)

Sets the proportion of the training dataset to be validated against the model on each epoch.

setVerbose(value)

Sets level of verbosity during training.

write()

Returns an MLWriter instance for this ML instance.

-

Attributes

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

batchSize

configProtoBytes

dropout

enableOutputLogs

endCol

epochs

evaluationLogExtended

getter_attrs

graphFolder

includeConfidence

inputCols

label

lazyAnnotator

learningRate

maxSentLen

outputCol

outputLogsPath

params

Returns all params ordered by name.

startCol

testDataset

validationSplit

verbose

-
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some -extra params. This implementation first calls Params.copy and -then make a copy of the companion Java pipeline component with -extra params. So both the Python wrapper and the Java pipeline -component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
-
-fit(dataset, params=None)
-

Fits a model to the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset.

-
-
paramsdict or list or tuple, optional

an optional param map that overrides embedded params. If a list/tuple of -param maps is given, this calls fit on each param map and returns a list of -models.

-
-
-
-
Returns
-
-
Transformer or a list of Transformer

fitted model(s)

-
-
-
-
-
- -
-
-fitMultiple(dataset, paramMaps)
-

Fits a model to the input dataset for each param map in paramMaps.

-
-

New in version 2.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset.

-
-
paramMapscollections.abc.Sequence

A Sequence of param maps.

-
-
-
-
Returns
-
-
_FitMultipleIterator

A thread safe iterable which contains one model for each param map. Each -call to next(modelIterator) will return (index, model) where model was fit -using paramMaps[index]. index values may not be sequential.

-
-
-
-
-
- -
-
-getInputCols()
-

Gets current column names of input annotations.

-
- -
-
-getLazyAnnotator()
-

Gets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getOutputCol()
-

Gets output column name of annotations.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-property params
-

Returns all params ordered by name. The default implementation -uses dir() to get all attributes of type -Param.

-
- -
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setBatchSize(size)[source]
-

Set Size for each batch in the optimization process.

-
-
Parameters
-
-
sizeint

Size for each batch in the optimization process

-
-
-
-
-
- -
-
-setConfigProtoBytes(b)[source]
-

Sets ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString().

-
-
Parameters
-
-
bbytes

ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()

-
-
-
-
-
- -
-
-setDropout(rate)[source]
-

Set a dropout at the output of each layer

-
-
Parameters
-
-
ratefloat

Dropout at the output of each layer

-
-
-
-
-
- -
-
-setEnableOutputLogs(value)[source]
-

Sets whether to output logs to the annotators log folder.

-
-
Parameters
-
-
valuebool

Whether to output logs to the annotators log folder.

-
-
-
-
-
- -
-
-setEndCol(e)[source]
-

Set column that contains the token number for the end of the target.

-
-
Parameters
-
-
rowstr

Column that contains the token number for the end of the target

-
-
-
-
-
- -
-
-setEpochs(number)[source]
-

Sets number of epochs for the optimization process

-
-
Parameters
-
-
numberint

Number of epochs for the optimization process

-
-
-
-
-
- -
-
-setEvaluationLogExtended(v)[source]
-

Sets whether to produce extended evaluation logs.

-
-
Parameters
-
-
vbool

Evaluation log extended.

-
-
-
-
-
- -
-
-setGraphFolder(p)[source]
-

Sets folder path that contains external graph files.

-
-
Parameters
-
-
pstr

Folder path that contains external graph files.

-
-
-
-
-
- -
-
-setIncludeConfidence(value)[source]
-

Sets whether you want to include confidence scores in annotation metadata.

-
-
Parameters
-
-
pbool
-
Value that selects if you want to use confidence scores in annotation metadata
-
-
-
-
- -
-
-setInputCols(*value)
-

Sets column names of input annotations.

-
-
Parameters
-
-
*valuestr

Input columns for the annotator

-
-
-
-
-
- -
-
-setLabelCol(label)[source]
-

Set a column with one label per document. Example of possible values: “present”, “absent”, “hypothetical”, “conditional”, “associated_with_other_person”, etc.

-
-
Parameters
-
-
labelstr

label. Column with one label per document. Example of possible values: “present”, “absent”, “hypothetical”, “conditional”, “associated_with_other_person”, etc.

-
-
-
-
-
- -
-
-setLazyAnnotator(value)
-

Sets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
-
Parameters
-
-
valuebool

Whether Annotator should be evaluated lazily in a -RecursivePipeline

-
-
-
-
-
- -
-
-setLearningRate(lamda)[source]
-

Set a learning rate for the optimization process

-
-
Parameters
-
-
lamdafloat

Learning rate for the optimization process.

-
-
-
-
Returns
-
-
Annotation

The new Annotation.

-
-
-
-
-
- -
-
-setMaxSentLen(length)[source]
-

Set the max length for an input sentence.

-
-
Parameters
-
-
lengthint

Max length for an input sentence.

-
-
-
-
-
- -
-
-setOutputCol(value)
-

Sets output column name of annotations.

-
-
Parameters
-
-
valuestr

Name of output column

-
-
-
-
-
- -
-
-setOutputLogsPath(value)[source]
-

Sets folder path to save training logs.

-
-
Parameters
-
-
valuestr

Folder path to save training logs.

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-setStartCol(s)[source]
-

Set a column that contains the token number for the start of the target

-
-
Parameters
-
-
sstr

Column that contains the token number for the start of the target

-
-
-
-
-
- -
-
-setTestDataset(path, read_as='SPARK', options={'format': 'parquet'})[source]
-

Sets path to the test dataset. If set, it is used to calculate statistics during training.

-
-
Parameters
-
-
pathstr

Path to the test dataset. If set, it is used to calculate statistics during training.

-
-
-
-
-
- -
-
-setValidationSplit(v)[source]
-
-
Sets the proportion of the training dataset to be validated against the model on each epoch.

The value should be between 0.0 and 1.0; by default it is 0.0 (off).

-
-
-
-
Parameters
-
-
vfloat

The proportion of the training dataset to be validated against the model on each epoch. The value should be between 0.0 and 1.0; by default it is 0.0 (off).

-
-
-
-
-
- -
-
-setVerbose(value)[source]
-

Sets level of verbosity during training.

-
-
Parameters
-
-
valueint

Level of verbosity during training.

-
-
-
-
-
- -
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

-
- -
- -
- - -
- - - - -
- - -
-
- - - -
-
- - - - - -
-
- - \ No newline at end of file diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.AssertionDLModel.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.AssertionDLModel.html deleted file mode 100644 index 1f61385ed3..0000000000 --- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.AssertionDLModel.html +++ /dev/null @@ -1,1118 +0,0 @@ - - - - - - - - sparknlp_jsl.annotator.AssertionDLModel — Spark NLP 3.3.0 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl.annotator.AssertionDLModel

-
-
-class sparknlp_jsl.annotator.AssertionDLModel(classname='com.johnsnowlabs.nlp.annotators.assertion.dl.AssertionDLModel', java_model=None)[source]
-

Bases: sparknlp.common.AnnotatorModel, sparknlp.common.HasStorageRef

-

AssertionDL is a deep-learning-based approach used to extract assertion status from extracted entities and text. AssertionDLModel requires DOCUMENT, CHUNK, and WORD_EMBEDDINGS type annotator inputs, which can be obtained, for example, from a DocumentAssembler, an NerConverter, and a WordEmbeddingsModel.

- ---- - - - - - - - - - - -

Input Annotation types

Output Annotation type

DOCUMENT, CHUNK, WORD_EMBEDDINGS

ASSERTION

-
-
Parameters
-
-
maxSentLen

Max length for an input sentence.

-
-
targetNerLabels

List of NER labels to mark as target for assertion, must match NER output.

-
-
configProtoBytes

ConfigProto from tensorflow, serialized into byte array.

-
-
classes

Tags used to trained this AssertionDLModel

-
-
-
-
-

Examples

-
>>> import sparknlp
->>> from sparknlp.base import *
->>> from sparknlp.common import *
->>> from sparknlp.annotator import *
->>> from sparknlp.training import *
->>> import sparknlp_jsl
->>> from sparknlp_jsl.base import *
->>> from sparknlp_jsl.annotator import *
->>> from pyspark.ml import Pipeline
->>> data = spark.createDataFrame([["Patient with severe fever and sore throat"],["Patient shows no stomach pain"],["She was maintained on an epidural and PCA for pain control."]]).toDF("text")
->>> documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
->>> sentenceDetector = SentenceDetector().setInputCols(["document"]).setOutputCol("sentence")
->>> tokenizer = Tokenizer().setInputCols(["sentence"]).setOutputCol("token")
->>> embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") \
-...  .setInputCols(["sentence", "token"]) \
-...  .setOutputCol("embeddings")
->>> nerModel = MedicalNerModel.pretrained("ner_clinical", "en", "clinical/models") \
-...  .setInputCols(["sentence", "token", "embeddings"]).setOutputCol("ner")
->>> nerConverter = NerConverter().setInputCols(["sentence", "token", "ner"]).setOutputCol("ner_chunk")
->>> clinicalAssertion = AssertionDLModel.pretrained("assertion_dl", "en", "clinical/models") \
-...  .setInputCols(["sentence", "ner_chunk", "embeddings"]) \
-...  .setOutputCol("assertion")
->>> assertionPipeline = Pipeline(stages=[
-...  documentAssembler,
-...  sentenceDetector,
-...  tokenizer,
-...  embeddings,
-...  nerModel,
-...  nerConverter,
-...  clinicalAssertion
-... ])
-
-
-
>>> assertionModel = assertionPipeline.fit(data)
-
-
-
>>> result = assertionModel.transform(data)
->>> result.selectExpr("ner_chunk.result as ner", "assertion.result").show(3, truncate=False)
-+--------------------------------+--------------------------------+
-|ner                             |result                          |
-+--------------------------------+--------------------------------+
-|[severe fever, sore throat]     |[present, present]              |
-|[stomach pain]                  |[absent]                        |
-|[an epidural, PCA, pain control]|[present, present, hypothetical]|
-+--------------------------------+--------------------------------+
-
-
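For quick, single-text inference the fitted pipeline can also be wrapped in a LightPipeline; a minimal sketch, with an illustrative input sentence:
>>> from sparknlp.base import LightPipeline
>>> light_model = LightPipeline(assertionModel)
>>> light_model.fullAnnotate("Patient denies any chest pain but reports severe headache.")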
-

Methods

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

__init__([classname, java_model])

Initialize this instance with a Java model object.

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

getInputCols()

Gets current column names of input annotations.

getLazyAnnotator()

Gets whether Annotator should be evaluated lazily in a RecursivePipeline.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getOutputCol()

Gets output column name of annotations.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

getStorageRef()

Gets unique reference name for identification.

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

pretrained(name[, lang, remote_loc])

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

set(param, value)

Sets a parameter in the embedded param map.

setConfigProtoBytes(b)

setInputCols(*value)

Sets column names of input annotations.

setLazyAnnotator(value)

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

setOutputCol(value)

Sets output column name of annotations.

setParamValue(paramName)

Sets the value of a parameter.

setParams()

setStorageRef(value)

Sets unique reference name for identification.

transform(dataset[, params])

Transforms the input dataset with optional parameters.

write()

Returns an MLWriter instance for this ML instance.

-

Attributes

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

classes

configProtoBytes

getter_attrs

inputCols

lazyAnnotator

maxSentLen

name

outputCol

params

Returns all params ordered by name.

storageRef

targetNerLabels

-
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some -extra params. This implementation first calls Params.copy and -then make a copy of the companion Java pipeline component with -extra params. So both the Python wrapper and the Java pipeline -component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
-
-getInputCols()
-

Gets current column names of input annotations.

-
- -
-
-getLazyAnnotator()
-

Gets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getOutputCol()
-

Gets output column name of annotations.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-getStorageRef()
-

Gets unique reference name for identification.

-
-
Returns
-
-
str

Unique reference name for identification

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-property params
-

Returns all params ordered by name. The default implementation -uses dir() to get all attributes of type -Param.

-
- -
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setInputCols(*value)
-

Sets column names of input annotations.

-
-
Parameters
-
-
*valuestr

Input columns for the annotator

-
-
-
-
-
- -
-
-setLazyAnnotator(value)
-

Sets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
-
Parameters
-
-
valuebool

Whether Annotator should be evaluated lazily in a -RecursivePipeline

-
-
-
-
-
- -
-
-setOutputCol(value)
-

Sets output column name of annotations.

-
-
Parameters
-
-
valuestr

Name of output column

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-setStorageRef(value)
-

Sets unique reference name for identification.

-
-
Parameters
-
-
valuestr

Unique reference name for identification

-
-
-
-
-
- -
-
-transform(dataset, params=None)
-

Transforms the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset

-
-
paramsdict, optional

an optional param map that overrides embedded params.

-
-
-
-
Returns
-
-
pyspark.sql.DataFrame

transformed dataset

-
-
-
-
-
- -
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

-
- -
- -
- - -
- - - - -
- - -
-
- - - -
-
- - - - - -
-
- - \ No newline at end of file diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.AssertionFilterer.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.AssertionFilterer.html deleted file mode 100644 index 278b883ff1..0000000000 --- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.AssertionFilterer.html +++ /dev/null @@ -1,1138 +0,0 @@ - - - - - - - - sparknlp_jsl.annotator.AssertionFilterer — Spark NLP 3.3.0 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl.annotator.AssertionFilterer

-
-
-class sparknlp_jsl.annotator.AssertionFilterer(classname='com.johnsnowlabs.nlp.annotators.chunker.AssertionFilterer', java_model=None)[source]
-

Bases: sparknlp.common.AnnotatorModel

-

Filters entities coming from ASSERTION type annotations and returns the CHUNKS. -Filters can be set via a white list on the extracted chunk, the assertion or a regular expression. -White list for assertion is enabled by default. To use chunk white list, criteria has to be set to isin. -For regex, criteria has to be set to regex.

- ---- - - - - - - - - - - -

Input Annotation types

Output Annotation type

DOCUMENT, CHUNK, ASSERTION

CHUNK

-
-
Parameters
-
-
whiteList

If defined, list of entities to process. The rest will be ignored

-
-
regex

If defined, list of regex patterns to process. The rest will be ignored.

-
-
criteria
-
Tag representing what is the criteria to filter the chunks. possibles values (assertion|isIn|regex)

assertion: Filter by the assertion -isIn : Filter by the chunk -regex : Filter using a regex

-
-
-
-
entitiesConfidence

Entity pairs to remove based on the confidence level

-
-
-
-
-

Examples

-
>>> import sparknlp
->>> from sparknlp.base import *
->>> from sparknlp.common import *
->>> from sparknlp.annotator import *
->>> from sparknlp.training import *
->>> import sparknlp_jsl
->>> from sparknlp_jsl.base import *
->>> from sparknlp_jsl.annotator import *
->>> from pyspark.ml import Pipeline
- To see how the assertions are extracted, see the example for AssertionDLModel.
- Define an extra step where the assertions are filtered
->>> assertionFilterer = AssertionFilterer() \
-...   .setInputCols(["sentence","ner_chunk","assertion"]) \
-...    .setOutputCol("filtered") \
-...   .setCriteria("assertion") \
-...   .setWhiteList(["present"])
-...
->>> assertionPipeline = Pipeline(stages=[
-...   documentAssembler,
-...   sentenceDetector,
-...   tokenizer,
-...   embeddings,
-...   nerModel,
-...   nerConverter,
-...   clinicalAssertion,
-...   assertionFilterer
-... ])
-...
->>> assertionModel = assertionPipeline.fit(data)
->>> result = assertionModel.transform(data)
-
-
-
>>> result.selectExpr("ner_chunk.result", "assertion.result").show(3, truncate=False)
-+--------------------------------+--------------------------------+
-|result                          |result                          |
-+--------------------------------+--------------------------------+
-|[severe fever, sore throat]     |[present, present]              |
-|[stomach pain]                  |[absent]                        |
-|[an epidural, PCA, pain control]|[present, present, hypothetical]|
-+--------------------------------+--------------------------------+
-
-
-
>>> result.select("filtered.result").show(3, truncate=False)
-+---------------------------+
-|result                     |
-+---------------------------+
-|[severe fever, sore throat]|
-|[]                         |
-|[an epidural, PCA]         |
-+---------------------------+
-
-
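A minimal sketch of the regex criteria described above; the pattern is illustrative:
>>> # keep only chunks whose text matches the (illustrative) pattern
>>> regexFilterer = AssertionFilterer() \
...     .setInputCols(["sentence", "ner_chunk", "assertion"]) \
...     .setOutputCol("filtered") \
...     .setCriteria("regex") \
...     .setRegex(["pain.*"])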
-

Methods

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

__init__([classname, java_model])

Initialize this instance with a Java model object.

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

getInputCols()

Gets current column names of input annotations.

getLazyAnnotator()

Gets whether Annotator should be evaluated lazily in a RecursivePipeline.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getOutputCol()

Gets output column name of annotations.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

set(param, value)

Sets a parameter in the embedded param map.

setCriteria(s)

Sets the tag representing the criteria used to filter the chunks.

setInputCols(*value)

Sets column names of input annotations.

setLazyAnnotator(value)

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

setOutputCol(value)

Sets output column name of annotations.

setParamValue(paramName)

Sets the value of a parameter.

setParams()

setRegex(value)

Sets list of regex patterns to process.

setWhiteList(value)

Sets list of entities to process.

transform(dataset[, params])

Transforms the input dataset with optional parameters.

write()

Returns an MLWriter instance for this ML instance.

-

Attributes

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

criteria

entitiesConfidence

getter_attrs

inputCols

lazyAnnotator

name

outputCol

params

Returns all params ordered by name.

regex

whiteList

-
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some -extra params. This implementation first calls Params.copy and -then make a copy of the companion Java pipeline component with -extra params. So both the Python wrapper and the Java pipeline -component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
-
-getInputCols()
-

Gets current column names of input annotations.

-
- -
-
-getLazyAnnotator()
-

Gets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getOutputCol()
-

Gets output column name of annotations.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-property params
-

Returns all params ordered by name. The default implementation -uses dir() to get all attributes of type -Param.

-
- -
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setCriteria(s)[source]
-

Sets the tag representing the criteria used to filter the chunks. Possible values: assertion|isIn|regex.

-
-
Parameters
-
-
sstr

The criteria to filter by: assertion, isIn, or regex.

-
-
-
-
-
- -
-
-setInputCols(*value)
-

Sets column names of input annotations.

-
-
Parameters
-
-
*valuestr

Input columns for the annotator

-
-
-
-
-
- -
-
-setLazyAnnotator(value)
-

Sets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
-
Parameters
-
-
valuebool

Whether Annotator should be evaluated lazily in a -RecursivePipeline

-
-
-
-
-
- -
-
-setOutputCol(value)
-

Sets output column name of annotations.

-
-
Parameters
-
-
valuestr

Name of output column

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-setRegex(value)[source]
-

Sets list of regex patterns to process. The rest will be ignored.

-
-
Parameters
-
-
valuelist

List of regex patterns to process.

-
-
-
-
-
- -
-
-setWhiteList(value)[source]
-

Sets list of entities to process. The rest will be ignored.

-
-
Parameters
-
-
valuelist

If defined, list of entities to process. The rest will be ignored.

-
-
-
-
-
- -
-
-transform(dataset, params=None)
-

Transforms the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset

-
-
paramsdict, optional

an optional param map that overrides embedded params.

-
-
-
-
Returns
-
-
pyspark.sql.DataFrame

transformed dataset

-
-
-
-
-
- -
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

-
- -
- -
- - -
- - - - -
- - -
-
- - - -
-
- - - - - -
-
- - \ No newline at end of file diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.AssertionLogRegApproach.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.AssertionLogRegApproach.html deleted file mode 100644 index 098e8468ed..0000000000 --- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.AssertionLogRegApproach.html +++ /dev/null @@ -1,1169 +0,0 @@ - - - - - - - - sparknlp_jsl.annotator.AssertionLogRegApproach — Spark NLP 3.3.0 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl.annotator.AssertionLogRegApproach

-
-
-class sparknlp_jsl.annotator.AssertionLogRegApproach[source]
-

Bases: sparknlp.common.AnnotatorApproach

-

Trains an assertion status model using logistic regression.

-

Excluding the label, the required inputs can be obtained with, for example, a SentenceDetector, a Chunker, and a WordEmbeddingsModel.

- ---- - - - - - - - - - - -

Input Annotation types

Output Annotation type

DOCUMENT, CHUNK, WORD_EMBEDDINGS

ASSERTION

-
-
Parameters
-
-
label

Column with label per each token

-
-
maxIter

Max number of iterations for algorithm

-
-
regParam

Regularization parameter

-
-
eNetParam

Elastic net parameter

-
-
beforeParam

Length of the context before the target

-
-
afterParam

Length of the context after the target

-
-
startCol

Column that contains the token number for the start of the target.

-
-
externalFeatures

Additional dictionaries paths to use as a features

-
-
endCol

Column that contains the token number for the end of the target

-
-
nerCol

Column with NER type annotation output, use either nerCol or startCol and endCol

-
-
targetNerLabels

List of NER labels to mark as target for assertion, must match NER output

-
-
-
-
-

Examples

-
>>> import sparknlp
->>> from sparknlp.base import *
->>> from sparknlp.annotator import *
->>> from sparknlp_jsl.annotator import *
->>> from sparknlp.training import *
->>> from pyspark.ml import Pipeline
->>> document_assembler = DocumentAssembler() \
-...    .setInputCol("text") \
-...    .setOutputCol("document")
-...
->>> sentence_detector = SentenceDetector() \
-...    .setInputCol("document") \
-...    .setOutputCol("sentence")
-...
->>> tokenizer = Tokenizer() \
-...    .setInputCols(["sentence"]) \
-...    .setOutputCol("token")
-...
->>> glove = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") \
-...    .setInputCols(["sentence", "token"]) \
-...    .setOutputCol("word_embeddings")     ...
->>> chunk = Chunker() \
-...    .setInputCols(["sentence"]) \
-...    .setChunkCol("chunk") \
-...    .setOutputCol("chunk")
-...
-Then the AssertionLogRegApproach model is defined. Label column is needed in the dataset for training.
->>> assertion = AssertionLogRegApproach() \
-...    .setLabelCol("label") \
-...    .setInputCols(["document", "chunk", "word_embeddings"]) \
-...    .setOutputCol("assertion") \
-...    .setReg(0.01) \
-...    .setBefore(11) \
-...    .setAfter(13) \
-...    .setStartCol("start") \
-...    .setEndCol("end")
-...
->>> assertionPipeline = Pipeline(stages=[
-...    document_assembler,
-...    sentence_detector,
-...    tokenizer,
-...    glove,
-...    chunk,
-...    assertion
-...])
-
-
-
>>> assertionModel = assertionPipeline.fit(dataset)
-
-
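As with the deep-learning variant, the fitted pipeline can then be applied to data; a minimal sketch using the column names from the pipeline defined above:
>>> result = assertionModel.transform(dataset)
>>> result.selectExpr("chunk.result", "assertion.result").show(truncate=False)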
-

Methods

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

__init__()

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

fit(dataset[, params])

Fits a model to the input dataset with optional parameters.

fitMultiple(dataset, paramMaps)

Fits a model to the input dataset for each param map in paramMaps.

getInputCols()

Gets current column names of input annotations.

getLazyAnnotator()

Gets whether Annotator should be evaluated lazily in a RecursivePipeline.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getOutputCol()

Gets output column name of annotations.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

set(param, value)

Sets a parameter in the embedded param map.

setAfter(after)

setBefore(before)

setEndCol(e)

setEnet(enet)

setInputCols(*value)

Sets column names of input annotations.

setLabelCol(label)

setLazyAnnotator(value)

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

setMaxIter(maxiter)

setNerCol(n)

setOutputCol(value)

Sets output column name of annotations.

setParamValue(paramName)

Sets the value of a parameter.

setReg(lamda)

setStartCol(s)

setTargetNerLabels(v)

write()

Returns an MLWriter instance for this ML instance.

-

Attributes


afterParam

beforeParam

eNetParam

endCol

getter_attrs

inputCols

label

lazyAnnotator

maxIter

nerCol

outputCol

params

Returns all params ordered by name.

regParam

startCol

targetNerLabels

-
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some -extra params. This implementation first calls Params.copy and -then make a copy of the companion Java pipeline component with -extra params. So both the Python wrapper and the Java pipeline -component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
-
-fit(dataset, params=None)
-

Fits a model to the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset.

-
-
paramsdict or list or tuple, optional

an optional param map that overrides embedded params. If a list/tuple of -param maps is given, this calls fit on each param map and returns a list of -models.

-
-
-
-
Returns
-
-
Transformer or a list of Transformer

fitted model(s)

-
-
-
-
-
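As a brief illustration (hypothetical values; assertion and dataset refer to the estimator and training data from the Examples section above), a param map can override an embedded param at fit time, and a list of param maps returns a list of models:

->>> model = assertion.fit(dataset, {assertion.maxIter: 30})
->>> models = assertion.fit(dataset, [{assertion.maxIter: 10}, {assertion.maxIter: 30}])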
- -
-
-fitMultiple(dataset, paramMaps)
-

Fits a model to the input dataset for each param map in paramMaps.

-
-

New in version 2.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset.

-
-
paramMapscollections.abc.Sequence

A Sequence of param maps.

-
-
-
-
Returns
-
-
_FitMultipleIterator

A thread safe iterable which contains one model for each param map. Each -call to next(modelIterator) will return (index, model) where model was fit -using paramMaps[index]. index values may not be sequential.

-
-
-
-
-
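A sketch of how the returned iterator could be consumed (again hypothetical, reusing the assertion estimator and dataset from the Examples section above):

->>> param_maps = [{assertion.maxIter: 10}, {assertion.maxIter: 30}]
->>> for index, model in assertion.fitMultiple(dataset, param_maps):
-...     print(index, model.uid)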
- -
-
-getInputCols()
-

Gets current column names of input annotations.

-
- -
-
-getLazyAnnotator()
-

Gets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getOutputCol()
-

Gets output column name of annotations.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-property params
-

Returns all params ordered by name. The default implementation -uses dir() to get all attributes of type -Param.

-
- -
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setInputCols(*value)
-

Sets column names of input annotations.

-
-
Parameters
-
-
*valuestr

Input columns for the annotator

-
-
-
-
-
- -
-
-setLazyAnnotator(value)
-

Sets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
-
Parameters
-
-
valuebool

Whether Annotator should be evaluated lazily in a -RecursivePipeline

-
-
-
-
-
- -
-
-setOutputCol(value)
-

Sets output column name of annotations.

-
-
Parameters
-
-
valuestr

Name of output column

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

-
- -
- -
- - -
- - - - -
- - -
-
- - - -
-
- - - - - -
-
- - \ No newline at end of file diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.AssertionLogRegModel.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.AssertionLogRegModel.html deleted file mode 100644 index c0dc42da51..0000000000 --- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.AssertionLogRegModel.html +++ /dev/null @@ -1,1136 +0,0 @@ - - - - - - - - sparknlp_jsl.annotator.AssertionLogRegModel — Spark NLP 3.3.0 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl.annotator.AssertionLogRegModel

-
-
-class sparknlp_jsl.annotator.AssertionLogRegModel(classname='com.johnsnowlabs.nlp.annotators.assertion.logreg.AssertionLogRegModel', java_model=None)[source]
-

Bases: sparknlp.common.AnnotatorModel, sparknlp.common.HasStorageRef

-
-
This is the main class in the AssertionLogReg family. Logistic regression is used to extract the assertion status from extracted entities and text. AssertionLogRegModel requires DOCUMENT, CHUNK and WORD_EMBEDDINGS type annotator inputs, which can be obtained from the annotators listed below.

-
-
-

Excluding the label, this can be done with, for example:

  • a SentenceDetector,
  • a Chunker,
  • a WordEmbeddingsModel.
Input Annotation types: DOCUMENT, CHUNK, WORD_EMBEDDINGS

Output Annotation type: ASSERTION

-
-
Parameters
-
-
beforeParam

Length of the context before the target

-
-
afterParam

Length of the context after the target

-
-
startCol

Column that contains the token number for the start of the target

-
-
endCol

Column that contains the token number for the end of the target

-
-
nerCol

Column with NER type annotation output, use either nerCol or startCol and endCol

-
-
targetNerLabels

List of NER labels to mark as target for assertion, must match NER output
-
-
-
-

Examples

-
>>> import sparknlp
->>> from sparknlp.base import *
->>> from sparknlp.annotator import *
->>> from sparknlp_jsl.annotator import *
->>> from sparknlp.training import *
->>> from pyspark.ml import Pipeline
-
-
-
>>> document_assembler = DocumentAssembler() \
-...    .setInputCol("text") \
-...    .setOutputCol("document")
-...
->>> sentence_detector = SentenceDetector() \
-...    .setInputCols(["document"]) \
-...    .setOutputCol("sentence")
-...
->>> tokenizer = Tokenizer() \
-...    .setInputCols(["sentence"]) \
-...    .setOutputCol("token")
-...
->>> embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") \
-...    .setInputCols(["sentence", "token"]) \
-...    .setOutputCol("word_embeddings") \
-...    .setCaseSensitive(False)
-...
->>> chunk = Chunker() \
-...    .setInputCols(["sentence"]) \
-...    .setChunkCol("chunk") \
-...    .setOutputCol("chunk")
-...
-Then the pretrained AssertionLogRegModel is loaded. No label column is needed for inference.
->>> assertion = AssertionLogRegModel.pretrained() \
-...    .setInputCols(["document", "chunk", "word_embeddings"]) \
-...    .setOutputCol("assertion")
-...
->>> assertionPipeline = Pipeline(stages=[
-...    document_assembler,
-...    sentence_detector,
-...    tokenizer,
-...    embeddings,
-...    chunk,
-...    assertion
-...])
-
-
-
>>> assertionModel = assertionPipeline.fit(dataset)
->>> assertionPretrained = assertionModel.transform(dataset)
-
-
-

Methods


__init__([classname, java_model])

Initialize this instance with a Java model object.

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

getInputCols()

Gets current column names of input annotations.

getLazyAnnotator()

Gets whether Annotator should be evaluated lazily in a RecursivePipeline.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getOutputCol()

Gets output column name of annotations.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

getStorageRef()

Gets unique reference name for identification.

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

pretrained(name[, lang, remote_loc])

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

set(param, value)

Sets a parameter in the embedded param map.

setInputCols(*value)

Sets column names of input annotations.

setLazyAnnotator(value)

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

setOutputCol(value)

Sets output column name of annotations.

setParamValue(paramName)

Sets the value of a parameter.

setParams()

setStorageRef(value)

Sets unique reference name for identification.

transform(dataset[, params])

Transforms the input dataset with optional parameters.

write()

Returns an MLWriter instance for this ML instance.

-

Attributes


afterParam

beforeParam

endCol

getter_attrs

inputCols

lazyAnnotator

name

nerCol

outputCol

params

Returns all params ordered by name.

startCol

storageRef

targetNerLabels

-
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some -extra params. This implementation first calls Params.copy and -then make a copy of the companion Java pipeline component with -extra params. So both the Python wrapper and the Java pipeline -component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
-
-getInputCols()
-

Gets current column names of input annotations.

-
- -
-
-getLazyAnnotator()
-

Gets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getOutputCol()
-

Gets output column name of annotations.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-getStorageRef()
-

Gets unique reference name for identification.

-
-
Returns
-
-
str

Unique reference name for identification

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-property params
-

Returns all params ordered by name. The default implementation -uses dir() to get all attributes of type -Param.

-
- -
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setInputCols(*value)
-

Sets column names of input annotations.

-
-
Parameters
-
-
*valuestr

Input columns for the annotator

-
-
-
-
-
- -
-
-setLazyAnnotator(value)
-

Sets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
-
Parameters
-
-
valuebool

Whether Annotator should be evaluated lazily in a -RecursivePipeline

-
-
-
-
-
- -
-
-setOutputCol(value)
-

Sets output column name of annotations.

-
-
Parameters
-
-
valuestr

Name of output column

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-setStorageRef(value)
-

Sets unique reference name for identification.

-
-
Parameters
-
-
valuestr

Unique reference name for identification

-
-
-
-
-
- -
-
-transform(dataset, params=None)
-

Transforms the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset

-
-
paramsdict, optional

an optional param map that overrides embedded params.

-
-
-
-
Returns
-
-
pyspark.sql.DataFrame

transformed dataset

-
-
-
-
-
- -
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

-
- -
- -
- - -
- - - - -
- - -
-
- - - -
-
- - - - - -
-
- - \ No newline at end of file diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.AverageEmbeddings.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.AverageEmbeddings.html deleted file mode 100644 index 8cc5117cc2..0000000000 --- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.AverageEmbeddings.html +++ /dev/null @@ -1,983 +0,0 @@ - - - - - - - - sparknlp_jsl.annotator.AverageEmbeddings — Spark NLP 3.3.0 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl.annotator.AverageEmbeddings

-
-
-class sparknlp_jsl.annotator.AverageEmbeddings(classname='com.johnsnowlabs.nlp.annotators.embeddings.AverageEmbeddings', java_model=None)[source]
-

Bases: sparknlp.common.AnnotatorModel

-
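This page provides no description or example for AverageEmbeddings. As a rough, hypothetical sketch (the input and output column names below are assumptions, not taken from this documentation), it is configured like any other annotator via the setters listed under Methods:

->>> from sparknlp_jsl.annotator import AverageEmbeddings
->>> avg_embeddings = AverageEmbeddings() \
-...     .setInputCols(["sentence_embeddings", "chunk_embeddings"]) \
-...     .setOutputCol("average_embeddings")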

Methods


__init__([classname, java_model])

Initialize this instance with a Java model object.

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

getInputCols()

Gets current column names of input annotations.

getLazyAnnotator()

Gets whether Annotator should be evaluated lazily in a RecursivePipeline.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getOutputCol()

Gets output column name of annotations.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

set(param, value)

Sets a parameter in the embedded param map.

setInputCols(*value)

Sets column names of input annotations.

setLazyAnnotator(value)

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

setOutputCol(value)

Sets output column name of annotations.

setParamValue(paramName)

Sets the value of a parameter.

setParams()

transform(dataset[, params])

Transforms the input dataset with optional parameters.

write()

Returns an MLWriter instance for this ML instance.

-

Attributes


getter_attrs

inputCols

lazyAnnotator

name

outputCol

params

Returns all params ordered by name.

-
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some -extra params. This implementation first calls Params.copy and -then make a copy of the companion Java pipeline component with -extra params. So both the Python wrapper and the Java pipeline -component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
-
-getInputCols()
-

Gets current column names of input annotations.

-
- -
-
-getLazyAnnotator()
-

Gets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getOutputCol()
-

Gets output column name of annotations.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-property params
-

Returns all params ordered by name. The default implementation -uses dir() to get all attributes of type -Param.

-
- -
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setInputCols(*value)
-

Sets column names of input annotations.

-
-
Parameters
-
-
*valuestr

Input columns for the annotator

-
-
-
-
-
- -
-
-setLazyAnnotator(value)
-

Sets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
-
Parameters
-
-
valuebool

Whether Annotator should be evaluated lazily in a -RecursivePipeline

-
-
-
-
-
- -
-
-setOutputCol(value)
-

Sets output column name of annotations.

-
-
Parameters
-
-
valuestr

Name of output column

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-transform(dataset, params=None)
-

Transforms the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset

-
-
paramsdict, optional

an optional param map that overrides embedded params.

-
-
-
-
Returns
-
-
pyspark.sql.DataFrame

transformed dataset

-
-
-
-
-
- -
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

-
- -
- -
- - -
- - - - -
- - -
-
- - - -
-
- - - - - -
-
- - \ No newline at end of file diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.BertSentenceChunkEmbeddings.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.BertSentenceChunkEmbeddings.html deleted file mode 100644 index a93af7e1e3..0000000000 --- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.BertSentenceChunkEmbeddings.html +++ /dev/null @@ -1,1351 +0,0 @@ - - - - - - - - sparknlp_jsl.annotator.BertSentenceChunkEmbeddings — Spark NLP 3.3.0 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl.annotator.BertSentenceChunkEmbeddings

-
-
-class sparknlp_jsl.annotator.BertSentenceChunkEmbeddings(classname='com.johnsnowlabs.nlp.embeddings.BertSentenceChunkEmbeddings', java_model=None)[source]
-

Bases: sparknlp.annotator.BertSentenceEmbeddings

-

BERT sentence embeddings for chunk annotations which take into account the context of the sentence the chunk appeared in. This is an extension of BertSentenceEmbeddings which combines the embedding of a chunk with the embedding of the surrounding sentence. For each input chunk annotation, it finds the corresponding sentence, computes the BERT sentence embedding of both the chunk and the sentence, and averages them. The resulting embeddings are useful in cases in which one needs a numerical representation of a text chunk which is sensitive to the context it appears in.

Input Annotation types: DOCUMENT, CHUNK

Output Annotation type: SENTENCE_EMBEDDINGS

-
-
Parameters
-
-
chunkWeight

Relative weight of chunk embeddings in comparison to sentence embeddings. The value should be between 0 and 1. The default is 0.5, which means the chunk and sentence embeddings are given equal weight.

-
-
-
-
-
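Conceptually, the combination presumably behaves like a chunkWeight-weighted average of the two vectors; the following is only a sketch under that assumption, not the library's actual implementation:

->>> import numpy as np
->>> def combine(chunk_emb, sentence_emb, chunk_weight=0.5):
-...     # assumed behaviour: chunkWeight-weighted average of the chunk and sentence vectors
-...     return chunk_weight * np.asarray(chunk_emb) + (1 - chunk_weight) * np.asarray(sentence_emb)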

Examples

-
>>> import sparknlp
->>> from sparknlp.base import *
->>> from sparknlp.common import *
->>> from sparknlp.annotator import *
->>> from sparknlp.training import *
->>> import sparknlp_jsl
->>> from sparknlp_jsl.base import *
->>> from sparknlp_jsl.annotator import *
->>> from pyspark.ml import Pipeline
-
-
-

First extract the prerequisites for the NerDLModel

-
>>> documentAssembler = DocumentAssembler() \
-...     .setInputCol("text") \
-...     .setOutputCol("document")
->>> sentence = SentenceDetector() \
-...     .setInputCols(["document"]) \
-...     .setOutputCol("sentence")
->>> tokenizer = Tokenizer() \
-...     .setInputCols(["sentence"]) \
-...     .setOutputCol("token")
->>> embeddings = WordEmbeddingsModel.pretrained() \
-...     .setInputCols(["sentence", "token"]) \
-...     .setOutputCol("bert")
->>> nerTagger = MedicalNerDLModel.pretrained() \
-...     .setInputCols(["sentence", "token", "bert"]) \
-...     .setOutputCol("ner")
->>> nerConverter = NerConverter() \
-...     .setInputCols(["sentence", "token","ner"]) \
-...     .setOutputCol("ner_chunk")
->>> sentence_chunk_embeddings = BertSentenceChunkEmbeddings.pretrained("sbluebert_base_uncased_mli", "en", "clinical/models") \
-...     .setInputCols(["sentence", "ner_chunk"]) \
-...     .setOutputCol("sentence_chunk_embeddings")
->>> pipeline = Pipeline().setStages([
-...     documentAssembler,
-...     sentence,
-...     tokenizer,
-...     embeddings,
-...     nerTagger,
-...     nerConverter,
-...     sentence_chunk_embeddings
-... ])
->>> data = spark.createDataFrame([["Her Diabetes has become type 2 in the last year with her Diabetes.He complains of swelling in his right forearm."]]).toDF("text")
->>> result = pipeline.fit(data).transform(data)
->>> result \
-...   .selectExpr("explode(sentence_chunk_embeddings) AS s") \
-...   .selectExpr("s.result", "slice(s.embeddings, 1, 5) AS averageEmbedding") \
-...   .show(truncate=False)
-+-----------------------------+-----------------------------------------------------------------+
-|                       result|                                                 averageEmbedding|
-+-----------------------------+-----------------------------------------------------------------+
-|Her Diabetes                 |[-0.31995273, -0.04710883, -0.28973156, -0.1294758, 0.12481072]  |
-|type 2                       |[-0.027161136, -0.24613449, -0.0949309, 0.1825444, -0.2252143]   |
-|her Diabetes                 |[-0.31995273, -0.04710883, -0.28973156, -0.1294758, 0.12481072]  |
-|swelling in his right forearm|[-0.45139068, 0.12400375, -0.0075617577, -0.90806055, 0.12871636]|
-+-----------------------------+-----------------------------------------------------------------+
-
-
-

Methods


__init__([classname, java_model])

Initialize this instance with a Java model object.

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

getBatchSize()

Gets current batch size.

getCaseSensitive()

Gets whether to ignore case in tokens for embeddings matching.

getDimension()

Gets embeddings dimension.

getInputCols()

Gets current column names of input annotations.

getLazyAnnotator()

Gets whether Annotator should be evaluated lazily in a RecursivePipeline.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getOutputCol()

Gets output column name of annotations.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

getStorageRef()

Gets unique reference name for identification.

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

loadSavedModel(folder, spark_session)

Loads a locally saved model.

pretrained([name, lang, remote_loc])

Downloads and loads a pretrained model.

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

set(param, value)

Sets a parameter in the embedded param map.

setBatchSize(v)

Sets batch size.

setCaseSensitive(value)

Sets whether to ignore case in tokens for embeddings matching.

setChunkWeight(value)

Sets the relative weight of chunk embeddings in comparison to sentence embeddings. The value should be between 0 and 1.

setConfigProtoBytes(b)

Sets configProto from tensorflow, serialized into byte array.

setDimension(value)

Sets embeddings dimension.

setInputCols(*value)

Sets column names of input annotations.

setIsLong(value)

Sets whether to use Long type instead of Int type for inputs buffer.

setLazyAnnotator(value)

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

setMaxSentenceLength(value)

Sets max sentence length to process.

setOutputCol(value)

Sets output column name of annotations.

setParamValue(paramName)

Sets the value of a parameter.

setParams()

setStorageRef(value)

Sets unique reference name for identification.

transform(dataset[, params])

Transforms the input dataset with optional parameters.

write()

Returns an MLWriter instance for this ML instance.

-

Attributes


batchSize

caseSensitive

chunkWeight

configProtoBytes

dimension

getter_attrs

inputCols

isLong

lazyAnnotator

maxSentenceLength

name

outputCol

params

Returns all params ordered by name.

storageRef

-
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some -extra params. This implementation first calls Params.copy and -then make a copy of the companion Java pipeline component with -extra params. So both the Python wrapper and the Java pipeline -component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
-
-getBatchSize()
-

Gets current batch size.

-
-
Returns
-
-
int

Current batch size

-
-
-
-
-
- -
-
-getCaseSensitive()
-

Gets whether to ignore case in tokens for embeddings matching.

-
-
Returns
-
-
bool

Whether to ignore case in tokens for embeddings matching

-
-
-
-
-
- -
-
-getDimension()
-

Gets embeddings dimension.

-
- -
-
-getInputCols()
-

Gets current column names of input annotations.

-
- -
-
-getLazyAnnotator()
-

Gets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getOutputCol()
-

Gets output column name of annotations.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-getStorageRef()
-

Gets unique reference name for identification.

-
-
Returns
-
-
str

Unique reference name for identification

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-static loadSavedModel(folder, spark_session)
-

Loads a locally saved model.

-
-
Parameters
-
-
folderstr

Folder of the saved model

-
-
spark_sessionpyspark.sql.SparkSession

The current SparkSession

-
-
-
-
Returns
-
-
BertSentenceEmbeddings

The restored model

-
-
-
-
-
- -
-
-property params
-

Returns all params ordered by name. The default implementation -uses dir() to get all attributes of type -Param.

-
- -
-
-static pretrained(name='sent_small_bert_L2_768', lang='en', remote_loc=None)[source]
-

Downloads and loads a pretrained model.

-
-
Parameters
-
-
namestr, optional

Name of the pretrained model, by default “sent_small_bert_L2_768”

-
-
langstr, optional

Language of the pretrained model, by default “en”

-
-
remote_locstr, optional

Optional remote address of the resource, by default None. Will use -Spark NLPs repositories otherwise.

-
-
-
-
Returns
-
-
BertSentenceEmbeddings

The restored model

-
-
-
-
-
- -
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setBatchSize(v)
-

Sets batch size.

-
-
Parameters
-
-
vint

Batch size

-
-
-
-
-
- -
-
-setCaseSensitive(value)
-

Sets whether to ignore case in tokens for embeddings matching.

-
-
Parameters
-
-
valuebool

Whether to ignore case in tokens for embeddings matching

-
-
-
-
-
- -
-
-setChunkWeight(value)[source]
-
-
-Sets the relative weight of chunk embeddings in comparison to sentence embeddings. The value should be between 0 and 1.

The default is 0.5, which means the chunk and sentence embeddings are given equal weight.

-
-
-
-
Parameters
-
-
valuefloat

Relative weight of chunk embeddings in comparison to sentence embeddings. The value should be between 0 and 1. The default is 0.5, which means the chunk and sentence embeddings are given equal weight.

-
-
-
-
-
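For instance, reusing the pretrained model and column names from the Examples section above, a chunk weight other than the 0.5 default (0.6 here is purely illustrative) can be set as follows:

->>> chunk_embeddings = BertSentenceChunkEmbeddings.pretrained("sbluebert_base_uncased_mli", "en", "clinical/models") \
-...     .setInputCols(["sentence", "ner_chunk"]) \
-...     .setOutputCol("sentence_chunk_embeddings") \
-...     .setChunkWeight(0.6)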
- -
-
-setConfigProtoBytes(b)
-

Sets configProto from tensorflow, serialized into byte array.

-
-
Parameters
-
-
bList[str]

ConfigProto from tensorflow, serialized into byte array

-
-
-
-
-
- -
-
-setDimension(value)
-

Sets embeddings dimension.

-
-
Parameters
-
-
valueint

Embeddings dimension

-
-
-
-
-
- -
-
-setInputCols(*value)
-

Sets column names of input annotations.

-
-
Parameters
-
-
*valuestr

Input columns for the annotator

-
-
-
-
-
- -
-
-setIsLong(value)
-

Sets whether to use Long type instead of Int type for inputs buffer.

-

Some Bert models require Long instead of Int.

-
-
Parameters
-
-
valuebool

Whether to use Long type instead of Int type for inputs buffer

-
-
-
-
-
- -
-
-setLazyAnnotator(value)
-

Sets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
-
Parameters
-
-
valuebool

Whether Annotator should be evaluated lazily in a -RecursivePipeline

-
-
-
-
-
- -
-
-setMaxSentenceLength(value)
-

Sets max sentence length to process.

-
-
Parameters
-
-
valueint

Max sentence length to process

-
-
-
-
-
- -
-
-setOutputCol(value)
-

Sets output column name of annotations.

-
-
Parameters
-
-
valuestr

Name of output column

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-setStorageRef(value)
-

Sets unique reference name for identification.

-
-
Parameters
-
-
valuestr

Unique reference name for identification

-
-
-
-
-
- -
-
-transform(dataset, params=None)
-

Transforms the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset

-
-
paramsdict, optional

an optional param map that overrides embedded params.

-
-
-
-
Returns
-
-
pyspark.sql.DataFrame

transformed dataset

-
-
-
-
-
- -
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

-
- -
- -
- - -
- - - - -
- - -
-
- - - -
-
- - - - - -
-
- - \ No newline at end of file diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.Chunk2Token.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.Chunk2Token.html deleted file mode 100644 index a94e94ab78..0000000000 --- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.Chunk2Token.html +++ /dev/null @@ -1,983 +0,0 @@ - - - - - - - - sparknlp_jsl.annotator.Chunk2Token — Spark NLP 3.3.0 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl.annotator.Chunk2Token

-
-
-class sparknlp_jsl.annotator.Chunk2Token(classname='com.johnsnowlabs.nlp.annotators.Chunk2Token', java_model=None)[source]
-

Bases: sparknlp.common.AnnotatorModel

-
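No description or example is given for Chunk2Token on this page. Judging by the name, it converts CHUNK annotations into TOKEN-like annotations; a minimal, hypothetical configuration (the column names are assumptions, not taken from this documentation) might look like:

->>> from sparknlp_jsl.annotator import Chunk2Token
->>> chunk2token = Chunk2Token() \
-...     .setInputCols(["ner_chunk"]) \
-...     .setOutputCol("chunk_token")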

Methods


__init__([classname, java_model])

Initialize this instance with a Java model object.

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

getInputCols()

Gets current column names of input annotations.

getLazyAnnotator()

Gets whether Annotator should be evaluated lazily in a RecursivePipeline.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getOutputCol()

Gets output column name of annotations.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

set(param, value)

Sets a parameter in the embedded param map.

setInputCols(*value)

Sets column names of input annotations.

setLazyAnnotator(value)

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

setOutputCol(value)

Sets output column name of annotations.

setParamValue(paramName)

Sets the value of a parameter.

setParams()

transform(dataset[, params])

Transforms the input dataset with optional parameters.

write()

Returns an MLWriter instance for this ML instance.

-

Attributes


getter_attrs

inputCols

lazyAnnotator

name

outputCol

params

Returns all params ordered by name.

-
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some -extra params. This implementation first calls Params.copy and -then make a copy of the companion Java pipeline component with -extra params. So both the Python wrapper and the Java pipeline -component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
-
-getInputCols()
-

Gets current column names of input annotations.

-
- -
-
-getLazyAnnotator()
-

Gets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getOutputCol()
-

Gets output column name of annotations.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-property params
-

Returns all params ordered by name. The default implementation -uses dir() to get all attributes of type -Param.

-
- -
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setInputCols(*value)
-

Sets column names of input annotations.

-
-
Parameters
-
-
*valuestr

Input columns for the annotator

-
-
-
-
-
- -
-
-setLazyAnnotator(value)
-

Sets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
-
Parameters
-
-
valuebool

Whether Annotator should be evaluated lazily in a -RecursivePipeline

-
-
-
-
-
- -
-
-setOutputCol(value)
-

Sets output column name of annotations.

-
-
Parameters
-
-
valuestr

Name of output column

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-transform(dataset, params=None)
-

Transforms the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset

-
-
paramsdict, optional

an optional param map that overrides embedded params.

-
-
-
-
Returns
-
-
pyspark.sql.DataFrame

transformed dataset

-
-
-
-
-
- -
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

-
- -
- -
- - -
- - - - -
- - -
-
- - - -
-
- - - - - -
-
- - \ No newline at end of file diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.ChunkConverter.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.ChunkConverter.html deleted file mode 100644 index a1233a362d..0000000000 --- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.ChunkConverter.html +++ /dev/null @@ -1,1020 +0,0 @@ - - - - - - - - sparknlp_jsl.annotator.ChunkConverter — Spark NLP 3.3.0 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl.annotator.ChunkConverter

-
-
-class sparknlp_jsl.annotator.ChunkConverter(classname='com.johnsnowlabs.nlp.annotators.chunker.ChunkConverter', java_model=None)[source]
-

Bases: sparknlp.common.AnnotatorModel

-

Converts chunks from a RegexMatcher into chunks with an entity in the metadata, using the identifier or field as the entity.

-

Input Annotation types: CHUNK

Output Annotation type: CHUNK

Examples

-
>>> test_data = spark.createDataFrame([
-...    (1,"My first sentence with the first rule. This is my second sentence with ceremonies rule."),
-...    ]).toDF("id", "text")
->>> document_assembler = DocumentAssembler().setInputCol('text').setOutputCol('document')
->>> sentence_detector = SentenceDetector().setInputCols(["document"]).setOutputCol("sentence")
->>> regex_matcher = RegexMatcher() \
-...    .setInputCols("sentence") \
-...    .setOutputCol("regex") \
-...    .setExternalRules(path="../src/test/resources/regex-matcher/rules.txt", delimiter=",")
->>> chunkConverter = ChunkConverter().setInputCols("regex").setOutputCol("chunk")
->>> pipeline = Pipeline(stages=[document_assembler, sentence_detector, regex_matcher, chunkConverter])
->>> model = pipeline.fit(test_data)
->>> outdf = model.transform(test_data)
->>> outdf.selectExpr("explode(chunk) AS col").show(truncate=False)
-+------------------------------------------------------------------------------------------------+
-|col                                                                                             |
-+------------------------------------------------------------------------------------------------+
-|[chunk, 23, 31, the first, [identifier -> NAME, sentence -> 0, chunk -> 0, entity -> NAME], []] |
-|[chunk, 71, 80, ceremonies, [identifier -> NAME, sentence -> 1, chunk -> 0, entity -> NAME], []]|
-+------------------------------------------------------------------------------------------------+
-
-
-

Methods


__init__([classname, java_model])

Initialize this instance with a Java model object.

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

getInputCols()

Gets current column names of input annotations.

getLazyAnnotator()

Gets whether Annotator should be evaluated lazily in a RecursivePipeline.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getOutputCol()

Gets output column name of annotations.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

set(param, value)

Sets a parameter in the embedded param map.

setInputCols(*value)

Sets column names of input annotations.

setLazyAnnotator(value)

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

setOutputCol(value)

Sets output column name of annotations.

setParamValue(paramName)

Sets the value of a parameter.

setParams()

transform(dataset[, params])

Transforms the input dataset with optional parameters.

write()

Returns an MLWriter instance for this ML instance.

-

Attributes


getter_attrs

inputCols

lazyAnnotator

name

outputCol

params

Returns all params ordered by name.

-
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some -extra params. This implementation first calls Params.copy and -then make a copy of the companion Java pipeline component with -extra params. So both the Python wrapper and the Java pipeline -component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
-
-getInputCols()
-

Gets current column names of input annotations.

-
- -
-
-getLazyAnnotator()
-

Gets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getOutputCol()
-

Gets output column name of annotations.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-property params
-

Returns all params ordered by name. The default implementation -uses dir() to get all attributes of type -Param.

-
- -
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setInputCols(*value)
-

Sets column names of input annotations.

-
-
Parameters
-
-
*valuestr

Input columns for the annotator

-
-
-
-
-
- -
-
-setLazyAnnotator(value)
-

Sets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
-
Parameters
-
-
valuebool

Whether Annotator should be evaluated lazily in a -RecursivePipeline

-
-
-
-
-
- -
-
-setOutputCol(value)
-

Sets output column name of annotations.

-
-
Parameters
-
-
valuestr

Name of output column

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-transform(dataset, params=None)
-

Transforms the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset

-
-
paramsdict, optional

an optional param map that overrides embedded params.

-
-
-
-
Returns
-
-
pyspark.sql.DataFrame

transformed dataset

-
-
-
-
-
- -
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

-
- -
\ No newline at end of file
diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.ChunkFilterer.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.ChunkFilterer.html
deleted file mode 100644
index b749d0cce8..0000000000
--- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.ChunkFilterer.html
+++ /dev/null
@@ -1,1097 +0,0 @@
- sparknlp_jsl.annotator.ChunkFilterer — Spark NLP 3.3.0 documentation

sparknlp_jsl.annotator.ChunkFilterer

-
-
-class sparknlp_jsl.annotator.ChunkFilterer(classname='com.johnsnowlabs.nlp.annotators.chunker.ChunkFilterer', java_model=None)[source]
-

Bases: sparknlp.common.AnnotatorModel

-
-
Model that filters entities coming from CHUNK annotations. Filters can be set via a white list of terms or a regular expression.

White list criteria is enabled by default. To use a regex, the criteria has to be set to regex. This model was trained using the ChunkFiltererApproach and has embedded the list of (entity, confidenceThreshold) pairs.

-
-
- ---- - - - - - - - - - - -

Input Annotation types

Output Annotation type

DOCUMENT, CHUNK, ASSERTION

CHUNK

-
-
Parameters
-
-
whiteList

If defined, list of entities to process. The rest will be ignored

-
-
regex

If defined, list of regexes to process. The rest will be ignored.

-
-
criteria
-
Tag representing the criteria used to filter the chunks. Possible values: (assertion|isIn|regex).

isIn : filter by the chunk
regex : filter using a regex

-
-
-
-
-
-
-

Examples

-
>>> import sparknlp
->>> from sparknlp.base import *
->>> from sparknlp.common import *
->>> from sparknlp.annotator import *
->>> from sparknlp.training import *
->>> import sparknlp_jsl
->>> from sparknlp_jsl.base import *
->>> from sparknlp_jsl.annotator import *
->>> from pyspark.ml import Pipeline
->>> data = spark.createDataFrame([["Has a past history of gastroenteritis and stomach pain, however patient ..."]]).toDF("text")
->>> docAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
->>> sentenceDetector = SentenceDetector().setInputCols(["document"]).setOutputCol("sentence")
->>> tokenizer = Tokenizer().setInputCols(["sentence"]).setOutputCol("token")
->>> posTagger = PerceptronModel.pretrained()        ...    .setInputCols(["sentence", "token"])        ...    .setOutputCol("pos")
->>> chunker = Chunker()        ...   .setInputCols(["pos", "sentence"])        ...   .setOutputCol("chunk")        ...   .setRegexParsers(["(<NN>)+"])
-...
->>> chunkerFilter = ChunkFilterer()        ...   .setInputCols(["sentence","chunk"])        ...   .setOutputCol("filtered")        ...   .setCriteria("isin")        ...   .setWhiteList(["gastroenteritis"])
-...
->>> pipeline = Pipeline(stages=[
-...   docAssembler,
-...   sentenceDetector,
-...   tokenizer,
-...   posTagger,
-...   chunker,
-...   chunkerFilter])
-...
->>> result = pipeline.fit(data).transform(data)
->>> result.selectExpr("explode(chunk)").show(truncate=False)
-
-
-
>>> result.selectExpr("explode(chunk)").show(truncate=False)
-+---------------------------------------------------------------------------------+
-|col                                                                              |
-+---------------------------------------------------------------------------------+
-|{chunk, 11, 17, history, {sentence -> 0, chunk -> 0}, []}                        |
-|{chunk, 22, 36, gastroenteritis, {sentence -> 0, chunk -> 1}, []}                |
-|{chunk, 42, 53, stomach pain, {sentence -> 0, chunk -> 2}, []}                   |
-|{chunk, 64, 70, patient, {sentence -> 0, chunk -> 3}, []}                        |
-|{chunk, 81, 110, stomach pain now.We don't care, {sentence -> 0, chunk -> 4}, []}|
-|{chunk, 118, 132, gastroenteritis, {sentence -> 0, chunk -> 5}, []}              |
-+---------------------------------------------------------------------------------+
-
-
-
>>> result.selectExpr("explode(filtered)").show(truncate=False)
-+-------------------------------------------------------------------+
-|col                                                                |
-+-------------------------------------------------------------------+
-|{chunk, 22, 36, gastroenteritis, {sentence -> 0, chunk -> 1}, []}  |
-|{chunk, 118, 132, gastroenteritis, {sentence -> 0, chunk -> 5}, []}|
-+-------------------------------------------------------------------+
-
-
-
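A regular-expression criteria can be used instead of the white list. This is a minimal sketch, assuming the same upstream stages as in the pipeline above; the pattern is purely illustrative:

>>> chunkerFilterRegex = ChunkFilterer() \
...     .setInputCols(["sentence", "chunk"]) \
...     .setOutputCol("filtered_regex") \
...     .setCriteria("regex") \
...     .setRegex(["stomach.*"])  # illustrative pattern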

Methods

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

__init__([classname, java_model])

Initialize this instance with a Java model object.

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

getInputCols()

Gets current column names of input annotations.

getLazyAnnotator()

Gets whether Annotator should be evaluated lazily in a RecursivePipeline.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getOutputCol()

Gets output column name of annotations.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

set(param, value)

Sets a parameter in the embedded param map.

setCriteria(s)

setFilterEntity(s)

setInputCols(*value)

Sets column names of input annotations.

setLazyAnnotator(value)

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

setOutputCol(value)

Sets output column name of annotations.

setParamValue(paramName)

Sets the value of a parameter.

setParams()

setRegex(value)

setWhiteList(value)

transform(dataset[, params])

Transforms the input dataset with optional parameters.

write()

Returns an MLWriter instance for this ML instance.

-

Attributes

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

criteria

filterValue

getter_attrs

inputCols

lazyAnnotator

name

outputCol

params

Returns all params ordered by name.

regex

whiteList

-
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some -extra params. This implementation first calls Params.copy and -then make a copy of the companion Java pipeline component with -extra params. So both the Python wrapper and the Java pipeline -component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
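As a small illustration of that merge order, reusing the chunkerFilter instance from the example above (the extra override is hypothetical):

>>> # extra values win over user-supplied values, which win over defaults
>>> merged = chunkerFilter.extractParamMap({chunkerFilter.lazyAnnotator: True})
>>> merged[chunkerFilter.lazyAnnotator]
True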
-
-getInputCols()
-

Gets current column names of input annotations.

-
- -
-
-getLazyAnnotator()
-

Gets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getOutputCol()
-

Gets output column name of annotations.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-property params
-

Returns all params ordered by name. The default implementation -uses dir() to get all attributes of type -Param.

-
- -
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setInputCols(*value)
-

Sets column names of input annotations.

-
-
Parameters
-
-
*valuestr

Input columns for the annotator

-
-
-
-
-
- -
-
-setLazyAnnotator(value)
-

Sets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
-
Parameters
-
-
valuebool

Whether Annotator should be evaluated lazily in a -RecursivePipeline

-
-
-
-
-
- -
-
-setOutputCol(value)
-

Sets output column name of annotations.

-
-
Parameters
-
-
valuestr

Name of output column

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-transform(dataset, params=None)
-

Transforms the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset

-
-
paramsdict, optional

an optional param map that overrides embedded params.

-
-
-
-
Returns
-
-
pyspark.sql.DataFrame

transformed dataset

-
-
-
-
-
- -
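For example, the embedded output column can be overridden for a single call (a sketch; annotated stands for a DataFrame that already contains the sentence and chunk columns produced upstream):

>>> # annotated: assumed DataFrame with the required input columns
>>> result = chunkerFilter.transform(annotated, {chunkerFilter.outputCol: "filtered_tmp"})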
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

-
- -
\ No newline at end of file
diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.ChunkFiltererApproach.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.ChunkFiltererApproach.html
deleted file mode 100644
index 9fb32677d1..0000000000
--- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.ChunkFiltererApproach.html
+++ /dev/null
@@ -1,1189 +0,0 @@
- sparknlp_jsl.annotator.ChunkFiltererApproach — Spark NLP 3.3.0 documentation

sparknlp_jsl.annotator.ChunkFiltererApproach

-
-
-class sparknlp_jsl.annotator.ChunkFiltererApproach(classname='com.johnsnowlabs.nlp.annotators.chunker.ChunkFiltererApproach')[source]
-

Bases: sparknlp.common.AnnotatorApproach

-
-
Model that filters entities coming from CHUNK annotations. Filters can be set via a white list of terms or a regular expression.

White list criteria is enabled by default. To use regex, criteria has to be set to regex.

-
-
- ---- - - - - - - - - - - -

Input Annotation types

Output Annotation type

DOCUMENT, CHUNK, ASSERTION

CHUNK

-
-
Parameters
-
-
whiteList

If defined, list of entities to process. The rest will be ignored

-
-
regex

If defined, list of regexes to process. The rest will be ignored.

-
-
criteria
-
Tag representing the criteria used to filter the chunks. Possible values: (assertion|isIn|regex).

isIn : filter by the chunk
regex : filter using a regex

-
-
-
-
entitiesConfidence

Path to a CSV file with (entity, confidenceThreshold) pairs. Chunks whose entities have a confidence lower than the corresponding threshold are filtered out.

-
-
-
-
-
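A sketch of how the confidence resource is wired in; the CSV path and its contents (lines such as Diagnosis,0.7) are assumptions made for illustration:

>>> chunkerFilterWithConfidence = ChunkFiltererApproach() \
...     .setInputCols(["sentence", "chunk"]) \
...     .setOutputCol("filtered") \
...     .setCriteria("isin") \
...     .setWhiteList(["gastroenteritis"]) \
...     .setEntitiesConfidenceResource("entities_confidence.csv")  # hypothetical path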

Examples

-
>>> import sparknlp
->>> from sparknlp.base import *
->>> from sparknlp.common import *
->>> from sparknlp.annotator import *
->>> from sparknlp.training import *
->>> import sparknlp_jsl
->>> from sparknlp_jsl.base import *
->>> from sparknlp_jsl.annotator import *
->>> from pyspark.ml import Pipeline
->>> data = spark.createDataFrame([["Has a past history of gastroenteritis and stomach pain, however patient ..."]]).toDF("text")
->>> docAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
->>> sentenceDetector = SentenceDetector().setInputCols(["document"]).setOutputCol("sentence")
->>> tokenizer = Tokenizer().setInputCols(["sentence"]).setOutputCol("token")
->>> posTagger = PerceptronModel.pretrained()     ...    .setInputCols(["sentence", "token"])     ...    .setOutputCol("pos")
->>> chunker = Chunker()     ...   .setInputCols(["pos", "sentence"])     ...   .setOutputCol("chunk")     ...   .setRegexParsers(["(<NN>)+"])
-...
->>> chunkerFilter = ChunkFiltererApproach()     ...   .setInputCols(["sentence","chunk"])     ...   .setOutputCol("filtered")     ...   .setCriteria("isin")     ...   .setWhiteList(["gastroenteritis"])
-...
->>> pipeline = Pipeline(stages=[
-...   docAssembler,
-...   sentenceDetector,
-...   tokenizer,
-...   posTagger,
-...   chunker,
-...   chunkerFilter])
-...
->>> result = pipeline.fit(data).transform(data)
->>> result.selectExpr("explode(chunk)").show(truncate=False)
-
-
-
>>> result.selectExpr("explode(chunk)").show(truncate=False)
-+---------------------------------------------------------------------------------+
-|col                                                                              |
-+---------------------------------------------------------------------------------+
-|{chunk, 11, 17, history, {sentence -> 0, chunk -> 0}, []}                        |
-|{chunk, 22, 36, gastroenteritis, {sentence -> 0, chunk -> 1}, []}                |
-|{chunk, 42, 53, stomach pain, {sentence -> 0, chunk -> 2}, []}                   |
-|{chunk, 64, 70, patient, {sentence -> 0, chunk -> 3}, []}                        |
-|{chunk, 81, 110, stomach pain now.We don't care, {sentence -> 0, chunk -> 4}, []}|
-|{chunk, 118, 132, gastroenteritis, {sentence -> 0, chunk -> 5}, []}              |
-+---------------------------------------------------------------------------------+
-
-
-
>>> result.selectExpr("explode(filtered)").show(truncate=False)
-+-------------------------------------------------------------------+
-|col                                                                |
-+-------------------------------------------------------------------+
-|{chunk, 22, 36, gastroenteritis, {sentence -> 0, chunk -> 1}, []}  |
-|{chunk, 118, 132, gastroenteritis, {sentence -> 0, chunk -> 5}, []}|
-+-------------------------------------------------------------------+
-
-
-

Methods

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

__init__([classname])

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

fit(dataset[, params])

Fits a model to the input dataset with optional parameters.

fitMultiple(dataset, paramMaps)

Fits a model to the input dataset for each param map in paramMaps.

getInputCols()

Gets current column names of input annotations.

getLazyAnnotator()

Gets whether Annotator should be evaluated lazily in a RecursivePipeline.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getOutputCol()

Gets output column name of annotations.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

set(param, value)

Sets a parameter in the embedded param map.

setCriteria(s)

Set tag representing what is the criteria to filter the chunks.

setEntitiesConfidenceResource(path[, ...])

setFilterEntity(s)

Set tag representing what is the criteria to filter the chunks.

setInputCols(*value)

Sets column names of input annotations.

setLazyAnnotator(value)

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

setOutputCol(value)

Sets output column name of annotations.

setParamValue(paramName)

Sets the value of a parameter.

setRegex(value)

Sets the list of regexes to process.

setWhiteList(value)

Sets list of entities to process.

write()

Returns an MLWriter instance for this ML instance.

-

Attributes

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

criteria

entitiesConfidenceResource

filterValue

getter_attrs

inputCols

lazyAnnotator

name

outputCol

params

Returns all params ordered by name.

regex

whiteList

-
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some -extra params. This implementation first calls Params.copy and -then make a copy of the companion Java pipeline component with -extra params. So both the Python wrapper and the Java pipeline -component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
-
-fit(dataset, params=None)
-

Fits a model to the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset.

-
-
paramsdict or list or tuple, optional

an optional param map that overrides embedded params. If a list/tuple of -param maps is given, this calls fit on each param map and returns a list of -models.

-
-
-
-
Returns
-
-
Transformer or a list of Transformer

fitted model(s)

-
-
-
-
-
- -
-
-fitMultiple(dataset, paramMaps)
-

Fits a model to the input dataset for each param map in paramMaps.

-
-

New in version 2.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset.

-
-
paramMapscollections.abc.Sequence

A Sequence of param maps.

-
-
-
-
Returns
-
-
_FitMultipleIterator

A thread safe iterable which contains one model for each param map. Each -call to next(modelIterator) will return (index, model) where model was fit -using paramMaps[index]. index values may not be sequential.

-
-
-
-
-
- -
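A short usage sketch; the param values are illustrative and train_df stands for a DataFrame that already contains the sentence and chunk input columns:

>>> approach = ChunkFiltererApproach() \
...     .setInputCols(["sentence", "chunk"]) \
...     .setOutputCol("filtered") \
...     .setCriteria("isin")
>>> param_maps = [{approach.whiteList: ["gastroenteritis"]},
...               {approach.whiteList: ["stomach pain"]}]
>>> models = [None] * len(param_maps)
>>> for index, model in approach.fitMultiple(train_df, param_maps):
...     models[index] = model  # index values may not be sequential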
-
-getInputCols()
-

Gets current column names of input annotations.

-
- -
-
-getLazyAnnotator()
-

Gets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getOutputCol()
-

Gets output column name of annotations.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-property params
-

Returns all params ordered by name. The default implementation -uses dir() to get all attributes of type -Param.

-
- -
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setCriteria(s)[source]
-

Sets the tag representing the criteria used to filter the chunks. Possible values: (isIn|regex).

-
-
Parameters
-
-
sstr

The criteria tag; possible values (isIn|regex)

-
-
-
-
-
- -
-
-setFilterEntity(s)[source]
-

Sets the tag representing the criteria used to filter the chunks. Possible values: (assertion|isIn|regex).

-
-
Parameters
-
-
sstr

Possible values: result|entity.

-
-
-
-
-
- -
-
-setInputCols(*value)
-

Sets column names of input annotations.

-
-
Parameters
-
-
*valuestr

Input columns for the annotator

-
-
-
-
-
- -
-
-setLazyAnnotator(value)
-

Sets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
-
Parameters
-
-
valuebool

Whether Annotator should be evaluated lazily in a -RecursivePipeline

-
-
-
-
-
- -
-
-setOutputCol(value)
-

Sets output column name of annotations.

-
-
Parameters
-
-
valuestr

Name of output column

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-setRegex(value)[source]
-

Sets the list of regexes to process. The rest will be ignored.

-
-
Parameters
-
-
valuelist

List of regexes to process

-
-
-
-
-
- -
-
-setWhiteList(value)[source]
-

Sets list of entities to process. The rest will be ignored.

-
-
Parameters
-
-
valuelist

If defined, list of entities to process. The rest will be ignored.

-
-
-
-
-
- -
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

-
- -
\ No newline at end of file
diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.ChunkKeyPhraseExtraction.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.ChunkKeyPhraseExtraction.html
deleted file mode 100644
index 334f3cfd72..0000000000
--- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.ChunkKeyPhraseExtraction.html
+++ /dev/null
@@ -1,1465 +0,0 @@
- sparknlp_jsl.annotator.ChunkKeyPhraseExtraction — Spark NLP 3.3.0 documentation

sparknlp_jsl.annotator.ChunkKeyPhraseExtraction

-
-
-class sparknlp_jsl.annotator.ChunkKeyPhraseExtraction(classname='com.johnsnowlabs.nlp.embeddings.ChunkKeyPhraseExtraction', java_model=None)[source]
-

Bases: sparknlp.annotator.BertSentenceEmbeddings

-

Chunk KeyPhrase Extraction uses Bert Sentence Embeddings to determine the most relevant key phrases describing a text. The input to the model consists of chunk annotations and sentence or document annotations. The model compares the chunks against the corresponding sentences/documents and selects the chunks which are most representative of the broader text context (i.e. the document or the sentence they belong to). The key phrase candidates (i.e. the input chunks) can be generated in various ways, e.g. by NGramGenerator, TextMatcher or NerConverter. The model operates either at sentence level (selecting the most descriptive chunks from the sentence they belong to) or at document level. In the latter case, the key phrases are selected to represent all the input document annotations.

- ---- - - - - - - - - - - -

Input Annotation types

Output Annotation type

DOCUMENT, CHUNK

CHUNK

-
-
Parameters
-
-
topN

The number of key phrases to select.

-
-
selectMostDifferent

Finds the topN * 2 key phrases and then selects topN of them, such that they are the most different from each other.

-
-
divergence

The divergence value determines how different the extracted key phrases are from each other. Uses Maximal Marginal Relevance (MMR). MMR should not be used in conjunction with selectMostDifferent, as they aim to achieve the same goal in different ways.

-
-
documentLevelProcessing

Whether to extract key phrases from the whole document or from the particular sentences which the chunks refer to.

-
-
concatenateSentences

Whether to concatenate the input sentence/document annotations before computing their embeddings.

-
-
-
-
-
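A document-level configuration that relies on selectMostDifferent rather than MMR might look like the sketch below; column names match the example that follows and the values are illustrative:

>>> key_phrase_extractor = ChunkKeyPhraseExtraction.pretrained() \
...     .setInputCols(["document", "ner_chunks"]) \
...     .setOutputCol("key_phrases") \
...     .setTopN(5) \
...     .setDocumentLevelProcessing(True) \
...     .setConcatenateSentences(True) \
...     .setSelectMostDifferent(True)  # divergence stays 0, it should not be combined with this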

Examples

-
>>> documenter = sparknlp.DocumentAssembler()     ...     .setInputCol("text")     ...     .setOutputCol("document")
-...
->>> sentencer = sparknlp.annotators.SentenceDetector()     ...     .setInputCols(["document"])    ...     .setOutputCol("sentences")
-...
->>> tokenizer = sparknlp.annotators.Tokenizer()     ...     .setInputCols(["document"])     ...     .setOutputCol("tokens")     ...
->>>  embeddings = sparknlp.annotators.WordEmbeddingsModel()     ...     .pretrained("embeddings_clinical", "en", "clinical/models")     ...     .setInputCols(["document", "tokens"])     ...     .setOutputCol("embeddings")
-...
->>> ner_tagger = MedicalNerModel()     ...     .pretrained("ner_jsl_slim", "en", "clinical/models")     ...     .setInputCols(["sentences", "tokens", "embeddings"])     ...     .setOutputCol("ner_tags")
-...
->>> ner_converter = NerConverter()    ...     .setInputCols("sentences", "tokens", "ner_tags")    ...     .setOutputCol("ner_chunks")
-...
->>> key_phrase_extractor = ChunkKeyPhraseExtraction    ...     .pretrained()    ...     .setTopN(1)    ...     .setDocumentLevelProcessing(False)    ...     .setDivergence(0.4)    ...     .setInputCols(["sentences", "ner_chunks"])    ...     .setOutputCol("ner_chunk_key_phrases")
-...
->>> pipeline = sparknlp.base.Pipeline()     ...     .setStages([documenter, sentencer, tokenizer, embeddings, ner_tagger, ner_converter, key_phrase_extractor])
-...
->>> data = spark.createDataFrame([["Her Diabetes has become type 2 in the last year with her Diabetes.He complains of swelling in his right forearm."]]).toDF("text")
->>> results = pipeline.fit(data).transform(data)
->>> results    ...     .selectExpr("explode(ner_chunk_key_phrases) AS key_phrase")    ...     .selectExpr(
-...         "key_phrase.result",
-...         "key_phrase.metadata.entity",
-...         "key_phrase.metadata.DocumentSimilarity",
-...         "key_phrase.metadata.MMRScore")    ...     .show(truncate=False)
-
-
- ----- - - - - - - - - - - -

result

DocumentSimilarity

MMRScore

gestational diabetes mellitus -28-year-old -type two diabetes mellitus

0.7391447825527298 -0.4366776288430703 -0.7323921930094919

0.44348688715422274 -0.13577881610104517 -0.085800103824974

-

Methods

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

__init__([classname, java_model])

Initialize this instance with a Java model object.

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

getBatchSize()

Gets current batch size.

getCaseSensitive()

Gets whether to ignore case in tokens for embeddings matching.

getDimension()

Gets embeddings dimension.

getInputCols()

Gets current column names of input annotations.

getLazyAnnotator()

Gets whether Annotator should be evaluated lazily in a RecursivePipeline.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getOutputCol()

Gets output column name of annotations.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

getStorageRef()

Gets unique reference name for identification.

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

loadSavedModel(folder, spark_session)

Loads a locally saved model.

pretrained([name, lang, remote_loc])

Downloads and loads a pretrained model.

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

set(param, value)

Sets a parameter in the embedded param map.

setBatchSize(v)

Sets batch size.

setCaseSensitive(value)

Sets whether to ignore case in tokens for embeddings matching.

setConcatenateSentences(value)

Concatenate the input sentence/document annotations before computing their embeddings.

setConfigProtoBytes(b)

Sets configProto from tensorflow, serialized into byte array.

setDimension(value)

Sets embeddings dimension.

setDivergence(value)

Set the level of divergence of the extracted key phrases. The value should be in the interval [0, 1].

setDocumentLevelProcessing(value)

Extract key phrases from the whole document or from particular sentences which the chunks refer to.

setDropPunctuation(value)

This parameter determines whether to remove punctuation marks from the input chunks.

setInputCols(*value)

Sets column names of input annotations.

setIsLong(value)

Sets whether to use Long type instead of Int type for inputs buffer.

setLazyAnnotator(value)

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

setMaxSentenceLength(value)

Sets max sentence length to process.

setOutputCol(value)

Sets output column name of annotations.

setParamValue(paramName)

Sets the value of a parameter.

setParams()

setSelectMostDifferent(value)

Let the model return the top N key phrases which are the most different from each other.

setStorageRef(value)

Sets unique reference name for identification.

setTopN(value)

Set the number of key phrases to extract.

transform(dataset[, params])

Transforms the input dataset with optional parameters.

write()

Returns an MLWriter instance for this ML instance.

-

Attributes

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

batchSize

caseSensitive

concatenateSentences

configProtoBytes

dimension

divergence

documentLevelProcessing

dropPunctuation

getter_attrs

inputCols

isLong

lazyAnnotator

maxSentenceLength

name

outputCol

params

Returns all params ordered by name.

selectMostDifferent

storageRef

topN

-
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some -extra params. This implementation first calls Params.copy and -then make a copy of the companion Java pipeline component with -extra params. So both the Python wrapper and the Java pipeline -component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
-
-getBatchSize()
-

Gets current batch size.

-
-
Returns
-
-
int

Current batch size

-
-
-
-
-
- -
-
-getCaseSensitive()
-

Gets whether to ignore case in tokens for embeddings matching.

-
-
Returns
-
-
bool

Whether to ignore case in tokens for embeddings matching

-
-
-
-
-
- -
-
-getDimension()
-

Gets embeddings dimension.

-
- -
-
-getInputCols()
-

Gets current column names of input annotations.

-
- -
-
-getLazyAnnotator()
-

Gets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getOutputCol()
-

Gets output column name of annotations.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-getStorageRef()
-

Gets unique reference name for identification.

-
-
Returns
-
-
str

Unique reference name for identification

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-static loadSavedModel(folder, spark_session)
-

Loads a locally saved model.

-
-
Parameters
-
-
folderstr

Folder of the saved model

-
-
spark_sessionpyspark.sql.SparkSession

The current SparkSession

-
-
-
-
Returns
-
-
BertSentenceEmbeddings

The restored model

-
-
-
-
-
- -
-
-property params
-

Returns all params ordered by name. The default implementation -uses dir() to get all attributes of type -Param.

-
- -
-
-static pretrained(name='sbert_jsl_medium_uncased', lang='en', remote_loc='clinical/models')[source]
-

Downloads and loads a pretrained model.

-
-
Parameters
-
-
namestr, optional

Name of the pretrained model, by default “sbert_jsl_medium_uncased”

-
-
langstr, optional

Language of the pretrained model, by default “en”

-
-
remote_locstr, optional

Optional remote address of the resource, by default None. Will use -Spark NLPs repositories otherwise.

-
-
-
-
Returns
-
-
BertSentenceEmbeddings

The restored model

-
-
-
-
-
- -
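Calling it with explicit arguments is equivalent to the no-argument call used in the class example above; the name below is the documented default:

>>> key_phrase_extractor = ChunkKeyPhraseExtraction.pretrained(
...     "sbert_jsl_medium_uncased", "en", "clinical/models")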
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setBatchSize(v)
-

Sets batch size.

-
-
Parameters
-
-
vint

Batch size

-
-
-
-
-
- -
-
-setCaseSensitive(value)
-

Sets whether to ignore case in tokens for embeddings matching.

-
-
Parameters
-
-
valuebool

Whether to ignore case in tokens for embeddings matching

-
-
-
-
-
- -
-
-setConcatenateSentences(value)[source]
-

Concatenate the input sentence/document annotations before computing their embeddings. This parameter is only used if documentLevelProcessing is true. If concatenateSentences is set to true, the model will concatenate the document/sentence input annotations and compute a single embedding. If it is false, the model will compute the embedding of each sentence separately and then average the resulting embedding vectors. The default value is ‘false’.

-
-
Parameters
-
-
valueboolean

Whether to concatenate the input sentence/document annotations in order to compute the embedding of the -whole document.

-
-
-
-
-
- -
-
-setConfigProtoBytes(b)
-

Sets configProto from tensorflow, serialized into byte array.

-
-
Parameters
-
-
bList[str]

ConfigProto from tensorflow, serialized into byte array

-
-
-
-
-
- -
-
-setDimension(value)
-

Sets embeddings dimension.

-
-
Parameters
-
-
valueint

Embeddings dimension

-
-
-
-
-
- -
-
-setDivergence(value)[source]
-
-
Set the level of divergence of the extracted key phrases. The value should be in the interval [0, 1].

This parameter should not be used if setSelectMostDifferent is true; the two parameters aim to achieve the same goal in different ways. The default is 0, i.e. there is no constraint on the order of the key phrases extracted.

-
-
-
-
-
Parameters
-
-
valuefloat

Divergence value

-
-
-
-
-
- -
-
-setDocumentLevelProcessing(value)[source]
-
-
Extract key phrases from the whole document or from particular sentences which the chunks refer to.

The default value is ‘false’.

-
-
-
-
Parameters
-
-
valueboolean

Whether to extract key phrases from the whole document(all sentences).

-
-
-
-
-
- -
-
-setDropPunctuation(value)[source]
-

This parameter determines whether to remove punctuation marks from the input chunks. Chunks coming from NER -models are not affected. -The default value is ‘true’.

-
-
Parameters
-
-
valueboolean

Whether to remove punctuation marks from input chunks.

-
-
-
-
-
- -
-
-setInputCols(*value)
-

Sets column names of input annotations.

-
-
Parameters
-
-
*valuestr

Input columns for the annotator

-
-
-
-
-
- -
-
-setIsLong(value)
-

Sets whether to use Long type instead of Int type for inputs buffer.

-

Some Bert models require Long instead of Int.

-
-
Parameters
-
-
valuebool

Whether to use Long type instead of Int type for inputs buffer

-
-
-
-
-
- -
-
-setLazyAnnotator(value)
-

Sets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
-
Parameters
-
-
valuebool

Whether Annotator should be evaluated lazily in a -RecursivePipeline

-
-
-
-
-
- -
-
-setMaxSentenceLength(value)
-

Sets max sentence length to process.

-
-
Parameters
-
-
valueint

Max sentence length to process

-
-
-
-
-
- -
-
-setOutputCol(value)
-

Sets output column name of annotations.

-
-
Parameters
-
-
valuestr

Name of output column

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-setSelectMostDifferent(value)[source]
-

Let the model return the top N key phrases which are the most different from each other. Using this parameter only makes sense if the divergence parameter is set to 0. The default value is ‘false’.

-
-
Parameters
-
-
valueboolean

whether to select the most different key phrases or not.

-
-
-
-
-
- -
-
-setStorageRef(value)
-

Sets unique reference name for identification.

-
-
Parameters
-
-
valuestr

Unique reference name for identification

-
-
-
-
-
- -
-
-setTopN(value)[source]
-

Set the number of key phrases to extract. The default value is 3.

-
-
Parameters
-
-
valueinteger

Number of key phrases to extract.

-
-
-
-
-
- -
-
-transform(dataset, params=None)
-

Transforms the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset

-
-
paramsdict, optional

an optional param map that overrides embedded params.

-
-
-
-
Returns
-
-
pyspark.sql.DataFrame

transformed dataset

-
-
-
-
-
- -
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

-
- -
\ No newline at end of file
diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.ChunkMergeApproach.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.ChunkMergeApproach.html
deleted file mode 100644
index 0fe07033bf..0000000000
--- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.ChunkMergeApproach.html
+++ /dev/null
@@ -1,1199 +0,0 @@
- sparknlp_jsl.annotator.ChunkMergeApproach — Spark NLP 3.3.0 documentation

sparknlp_jsl.annotator.ChunkMergeApproach

-
-
-class sparknlp_jsl.annotator.ChunkMergeApproach[source]
-

Bases: sparknlp.common.AnnotatorApproach

-

Merges two chunk columns coming from two annotators (NER, ContextualParser, or any other annotator producing chunks). The merger of the two chunk columns is made by selecting one chunk from one of the columns according to certain criteria. The decision on which chunk to select is made according to the chunk indices in the source document (chunks with longer lengths and the highest information will be kept from each source). Labels can be changed with setReplaceDictResource.

- ---- - - - - - - - - - - -

Input Annotation types

Output Annotation type

CHUNK,CHUNK

CHUNK

-
-
Parameters
-
-
mergeOverlapping

whether to merge overlapping matched chunks. Defaults false

-
-
falsePositivesResource

file with false positive pairs

-
-
replaceDictResource

replace dictionary pairs

-
-
chunkPrecedence

Select what is the precedence when two chunks have the same start and end indices. Possible values are [entity|identifier|field]

-
-
blackList

If defined, list of entities to ignore. The rest will be processed.

-
-
-
-
-
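The resources and precedence described above can be combined as in this sketch; both CSV paths and their contents are assumptions made for illustration (see the full pipeline example that follows):

>>> chunk_merger = ChunkMergeApproach() \
...     .setInputCols(["jsl_ner_chunk", "bionlp_ner_chunk"]) \
...     .setOutputCol("merged_chunk") \
...     .setMergeOverlapping(True) \
...     .setChunkPrecedence("entity") \
...     .setReplaceDictResource("replace_dict.csv", "TEXT", {"delimiter": ","}) \
...     .setFalsePositivesResource("false_positives.csv") \
...     .setBlackList(["Gender"])  # hypothetical resource files and label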

Examples

-
>>> import sparknlp
->>> from sparknlp.base import *
->>> from sparknlp.common import *
->>> from sparknlp.annotator import *
->>> from sparknlp.training import *
->>> import sparknlp_jsl
->>> from sparknlp_jsl.base import *
->>> from sparknlp_jsl.annotator import *
->>> from pyspark.ml import Pipeline
-Define a pipeline with 2 different NER models with a ChunkMergeApproach at the end
->>> data = spark.createDataFrame([["A 63-year-old man presents to the hospital ..."]]).toDF("text")
->>> pipeline = Pipeline(stages=[
-...  DocumentAssembler().setInputCol("text").setOutputCol("document"),
-...  SentenceDetector().setInputCols(["document"]).setOutputCol("sentence"),
-...  Tokenizer().setInputCols(["sentence"]).setOutputCol("token"),
-...   WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models").setOutputCol("embs"),
-...   MedicalNerModel.pretrained("ner_jsl", "en", "clinical/models") \
-...     .setInputCols(["sentence", "token", "embs"]).setOutputCol("jsl_ner"),
-...  NerConverter().setInputCols(["sentence", "token", "jsl_ner"]).setOutputCol("jsl_ner_chunk"),
-...   MedicalNerModel.pretrained("ner_bionlp", "en", "clinical/models") \
-...     .setInputCols(["sentence", "token", "embs"]).setOutputCol("bionlp_ner"),
-...  NerConverter().setInputCols(["sentence", "token", "bionlp_ner"]) \
-...     .setOutputCol("bionlp_ner_chunk"),
-...  ChunkMergeApproach().setInputCols(["jsl_ner_chunk", "bionlp_ner_chunk"]).setOutputCol("merged_chunk")
->>> ])
->>> result = pipeline.fit(data).transform(data).cache()
->>> result.selectExpr("explode(merged_chunk) as a") \
-...   .selectExpr("a.begin","a.end","a.result as chunk","a.metadata.entity as entity") \
-...   .show(5, False)
-+-----+---+-----------+---------+
-|begin|end|chunk      |entity   |
-+-----+---+-----------+---------+
-|5    |15 |63-year-old|Age      |
-|17   |19 |man        |Gender   |
-|64   |72 |recurrent  |Modifier |
-|98   |107|cellulitis |Diagnosis|
-|110  |119|pneumonias |Diagnosis|
-+-----+---+-----------+---------+
-
-
-

Methods

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

__init__()

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

fit(dataset[, params])

Fits a model to the input dataset with optional parameters.

fitMultiple(dataset, paramMaps)

Fits a model to the input dataset for each param map in paramMaps.

getInputCols()

Gets current column names of input annotations.

getLazyAnnotator()

Gets whether Annotator should be evaluated lazily in a RecursivePipeline.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getOutputCol()

Gets output column name of annotations.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

set(param, value)

Sets a parameter in the embedded param map.

setBlackList(entities)

If defined, list of entities to ignore.

setChunkPrecedence(b)

Sets what is the precedence when two chunks have the same start and end indices.

setFalsePositivesResource(path[, read_as, ...])

Sets file with false positive pairs

setInputCols(*value)

Sets column names of input annotations.

setLazyAnnotator(value)

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

setMergeOverlapping(b)

Sets whether to merge overlapping matched chunks.

setOutputCol(value)

Sets output column name of annotations.

setParamValue(paramName)

Sets the value of a parameter.

setReplaceDictResource(path[, read_as, options])

Sets replace dictionary pairs

write()

Returns an MLWriter instance for this ML instance.

-

Attributes

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

blackList

chunkPrecedence

falsePositivesResource

getter_attrs

inputCols

lazyAnnotator

mergeOverlapping

name

outputCol

params

Returns all params ordered by name.

replaceDictResource

-
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some -extra params. This implementation first calls Params.copy and -then make a copy of the companion Java pipeline component with -extra params. So both the Python wrapper and the Java pipeline -component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
-
-fit(dataset, params=None)
-

Fits a model to the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset.

-
-
paramsdict or list or tuple, optional

an optional param map that overrides embedded params. If a list/tuple of -param maps is given, this calls fit on each param map and returns a list of -models.

-
-
-
-
Returns
-
-
Transformer or a list of Transformer

fitted model(s)

-
-
-
-
-
- -
-
-fitMultiple(dataset, paramMaps)
-

Fits a model to the input dataset for each param map in paramMaps.

-
-

New in version 2.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset.

-
-
paramMapscollections.abc.Sequence

A Sequence of param maps.

-
-
-
-
Returns
-
-
_FitMultipleIterator

A thread safe iterable which contains one model for each param map. Each -call to next(modelIterator) will return (index, model) where model was fit -using paramMaps[index]. index values may not be sequential.

-
-
-
-
-
- -
-
-getInputCols()
-

Gets current column names of input annotations.

-
- -
-
-getLazyAnnotator()
-

Gets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getOutputCol()
-

Gets output column name of annotations.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-property params
-

Returns all params ordered by name. The default implementation -uses dir() to get all attributes of type -Param.

-
- -
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
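For instance, an approach configured like chunk_merger above can be persisted and restored later (the path is illustrative):

>>> chunk_merger.save("/tmp/chunk_merge_approach")  # hypothetical path
>>> restored = ChunkMergeApproach.load("/tmp/chunk_merge_approach")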
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setBlackList(entities)[source]
-

If defined, list of entities to ignore. The rest will be processed.

-
-
Parameters
-
-
entitieslist

If defined, list of entities to ignore. The rest will be processed.

-
-
-
-
-
- -
-
-setChunkPrecedence(b)[source]
-

Sets what is the precedence when two chunks have the same start and end indices. Possible values are [entity|identifier|field]

-
-
Parameters
-
-
bstr

Select what is the precedence when two chunks have the same start and end indices. Possible values are [entity|identifier|field]

-
-
-
-
-
- -
-
-setFalsePositivesResource(path, read_as='TEXT', options={'delimiter': ','})[source]
-

Sets file with false positive pairs

-
-
Parameters
-
-
pathstr

Path to the external resource

-
-
read_asstr, optional

How to read the resource, by default ReadAs.TEXT

-
-
optionsdict, optional

Options for reading the resource, by default {“format”: “text”}

-
-
-
-
-
- -
-
-setInputCols(*value)
-

Sets column names of input annotations.

-
-
Parameters
-
-
*valuestr

Input columns for the annotator

-
-
-
-
-
- -
-
-setLazyAnnotator(value)
-

Sets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
-
Parameters
-
-
valuebool

Whether Annotator should be evaluated lazily in a -RecursivePipeline

-
-
-
-
-
- -
-
-setMergeOverlapping(b)[source]
-

Sets whether to merge overlapping matched chunks. Defaults false

-
-
Parameters
-
-
bbool

whether to merge overlapping matched chunks. Defaults false

-
-
-
-
-
- -
-
-setOutputCol(value)
-

Sets output column name of annotations.

-
-
Parameters
-
-
valuestr

Name of output column

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-setReplaceDictResource(path, read_as='TEXT', options={'delimiter': ','})[source]
-

Sets replace dictionary pairs

-
-
Parameters
-
-
pathstr

Path to the external resource

-
-
read_asstr, optional

How to read the resource, by default ReadAs.TEXT

-
-
optionsdict, optional

Options for reading the resource, by default {“format”: “text”}

-
-
-
-
-
- -
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

-
- -
\ No newline at end of file
diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.ChunkMergeModel.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.ChunkMergeModel.html
deleted file mode 100644
index 0603baf34a..0000000000
--- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.ChunkMergeModel.html
+++ /dev/null
@@ -1,1064 +0,0 @@
- sparknlp_jsl.annotator.ChunkMergeModel — Spark NLP 3.3.0 documentation

sparknlp_jsl.annotator.ChunkMergeModel

-
-
-class sparknlp_jsl.annotator.ChunkMergeModel(classname='com.johnsnowlabs.nlp.annotators.merge.ChunkMergeModel', java_model=None)[source]
-

Bases: sparknlp.common.AnnotatorModel

-

The model produced by ChunkMergeApproach.

- ---- - - - - - - - - - - -

Input Annotation types

Output Annotation type

CHUNK,CHUNK

CHUNK

-
-
Parameters
-
-
mergeOverlapping

whether to merge overlapping matched chunks. Defaults false

-
-
chunkPrecedence

Select what is the precedence when two chunks have the same start and end indices. Possible values are [entity|identifier|field]

-
-
blackList

If defined, list of entities to ignore. The rest will be processed.

-
-
-
-
-
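This page has no example of its own; a minimal sketch of how the model is typically obtained and applied follows (annotated stands for a DataFrame that already contains the two chunk columns):

>>> merge_model = ChunkMergeApproach() \
...     .setInputCols(["jsl_ner_chunk", "bionlp_ner_chunk"]) \
...     .setOutputCol("merged_chunk") \
...     .setMergeOverlapping(True) \
...     .fit(annotated)  # annotated: assumed DataFrame with both chunk columns
>>> merged = merge_model.transform(annotated)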

Methods

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

__init__([classname, java_model])

Initialize this instance with a Java model object.

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

getInputCols()

Gets current column names of input annotations.

getLazyAnnotator()

Gets whether Annotator should be evaluated lazily in a RecursivePipeline.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getOutputCol()

Gets output column name of annotations.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

pretrained(name[, lang, remote_loc])

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

set(param, value)

Sets a parameter in the embedded param map.

setChunkPrecedence(b)

Sets what is the precedence when two chunks have the same start and end indices.

setInputCols(*value)

Sets column names of input annotations.

setLazyAnnotator(value)

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

setMergeOverlapping(b)

Sets whether to merge overlapping matched chunks.

setOutputCol(value)

Sets output column name of annotations.

setParamValue(paramName)

Sets the value of a parameter.

setParams()

transform(dataset[, params])

Transforms the input dataset with optional parameters.

write()

Returns an MLWriter instance for this ML instance.

-

Attributes

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

blackList

chunkPrecedence

falsePositives

getter_attrs

inputCols

lazyAnnotator

mergeOverlapping

name

outputCol

params

Returns all params ordered by name.

replaceDict

-
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some -extra params. This implementation first calls Params.copy and -then make a copy of the companion Java pipeline component with -extra params. So both the Python wrapper and the Java pipeline -component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
-
-getInputCols()
-

Gets current column names of input annotations.

-
- -
-
-getLazyAnnotator()
-

Gets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getOutputCol()
-

Gets output column name of annotations.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-property params
-

Returns all params ordered by name. The default implementation -uses dir() to get all attributes of type -Param.

-
- -
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setChunkPrecedence(b)[source]
-

Sets which chunk takes precedence when two chunks have the same start and end indices. Possible values are [entity|identifier|field].

-
-
Parameters
-
-
b : str

The precedence to use when two chunks have the same start and end indices. Possible values are [entity|identifier|field].

-
-
-
-
-
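For illustration, a call using one of the listed possible values (the variable name is assumed):

>>> chunkMerger = chunkMerger.setChunkPrecedence("entity")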
- -
-
-setInputCols(*value)
-

Sets column names of input annotations.

-
-
Parameters
-
-
*valuestr

Input columns for the annotator

-
-
-
-
-
- -
-
-setLazyAnnotator(value)
-

Sets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
-
Parameters
-
-
valuebool

Whether Annotator should be evaluated lazily in a -RecursivePipeline

-
-
-
-
-
- -
-
-setMergeOverlapping(b)[source]
-

Sets whether to merge overlapping matched chunks. Defaults false

-
-
Parameters
-
-
b : bool

Whether to merge overlapping matched chunks. Defaults false

-
-
-
-
-
- -
-
-setOutputCol(value)
-

Sets output column name of annotations.

-
-
Parameters
-
-
valuestr

Name of output column

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-transform(dataset, params=None)
-

Transforms the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset

-
-
paramsdict, optional

an optional param map that overrides embedded params.

-
-
-
-
Returns
-
-
pyspark.sql.DataFrame

transformed dataset

-
-
-
-
-
- -
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

-
- -
\ No newline at end of file
diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.ChunkSentenceSplitter.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.ChunkSentenceSplitter.html
deleted file mode 100644
index cf84b70c37..0000000000
--- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.ChunkSentenceSplitter.html
+++ /dev/null
@@ -1,1086 +0,0 @@
-sparknlp_jsl.annotator.ChunkSentenceSplitter — Spark NLP 3.3.0 documentation

sparknlp_jsl.annotator.ChunkSentenceSplitter

-
-
-class sparknlp_jsl.annotator.ChunkSentenceSplitter(classname='com.johnsnowlabs.nlp.annotators.chunker.ChunkSentenceSplitter', java_model=None)[source]
-

Bases: sparknlp.common.AnnotatorModel

-

Splits the document using the provided chunks and puts the chunk entity in the metadata. The first piece of the document, up to the first chunk, will have that entity as its header.

-

Use the identifier or field as the entity.

- ---- - - - - - - - - - - -

Input Annotation types

Output Annotation type

DOCUMENT, CHUNK

DOCUMENT

-
-
Parameters
-
-
inputType

The type of the entity that you want to filter, by default sentence_embeddings. Possible values: document|token|wordpiece|word_embeddings|sentence_embeddings|category|date|sentiment|pos|chunk|named_entity|regex|dependency|labeled_dependency|language|keyword

-
-
Examples
-
——–
-
>>> documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
-
>>> regexMatcher = RegexMatcher() \
...     .setExternalRules("../src/test/resources/chunker/title_regex.txt", ",") \
...     .setInputCols("document") \
...     .setOutputCol("chunks")
-
>>> chunkSentenceSplitter = ChunkSentenceSplitter().setInputCols("chunks", "document").setOutputCol("paragraphs")
-
>>> pipeline = Pipeline().setStages([documentAssembler, regexMatcher, chunkSentenceSplitter])
-
>>> result = pipeline.fit(data).transform(data).select("paragraphs")
-
>>> result.show()
-
-
-
-

Methods

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

__init__([classname, java_model])

Initialize this instance with a Java model object.

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

getInputCols()

Gets current column names of input annotations.

getLazyAnnotator()

Gets whether Annotator should be evaluated lazily in a RecursivePipeline.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getOutputCol()

Gets output column name of annotations.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

set(param, value)

Sets a parameter in the embedded param map.

setDefaultEntity(value)

Sets the key in the metadata dictionary that you want to filter (by default 'entity')

setGroupBySentences(value)

Sets the groupBySentences that allow split the paragraphs grouping the chunks by sentences.

setInputCols(*value)

Sets column names of input annotations.

setInsertChunk(value)

Whether to insert the chunk in the paragraph or not.

setLazyAnnotator(value)

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

setOutputCol(value)

Sets output column name of annotations.

setParamValue(paramName)

Sets the value of a parameter.

setParams()

transform(dataset[, params])

Transforms the input dataset with optional parameters.

write()

Returns an MLWriter instance for this ML instance.

-

Attributes

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

defaultEntity

getter_attrs

groupBySentences

inputCols

insertChunk

lazyAnnotator

name

outputCol

params

Returns all params ordered by name.

-
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some -extra params. This implementation first calls Params.copy and -then make a copy of the companion Java pipeline component with -extra params. So both the Python wrapper and the Java pipeline -component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
-
-getInputCols()
-

Gets current column names of input annotations.

-
- -
-
-getLazyAnnotator()
-

Gets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getOutputCol()
-

Gets output column name of annotations.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-property params
-

Returns all params ordered by name. The default implementation -uses dir() to get all attributes of type -Param.

-
- -
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setDefaultEntity(value)[source]
-

Sets the key in the metadata dictionary that you want to filter (by default ‘entity’)

-
-
Parameters
-
-
value : str

The key in the metadata dictionary that you want to filter (by default ‘entity’)

-
-
-
-
-
- -
-
-setGroupBySentences(value)[source]
-
-
Sets groupBySentences, which allows splitting the paragraphs by grouping the chunks by sentence.

If false, we assume there is a single document annotation and all chunks belong to that document. Use false if the input column of your chunk annotator was a sentence detector column; use true when a sentence detector produced the input column or when the document has many sentences per row.

-
-
-
-
Parameters
-
-
value : bool

Allow split the paragraphs grouping the chunks by sentences

-
-
-
-
-
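For example, a minimal sketch assuming the annotator instance already exists:

>>> chunkSentenceSplitter = chunkSentenceSplitter.setGroupBySentences(False)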
- -
-
-setInputCols(*value)
-

Sets column names of input annotations.

-
-
Parameters
-
-
*valuestr

Input columns for the annotator

-
-
-
-
-
- -
-
-setInsertChunk(value)[source]
-

Whether to insert the chunk in the paragraph or not.

-
-
Parameters
-
-
value : bool

Whether to insert the chunk in the paragraph or not.

-
-
-
-
-
- -
-
-setLazyAnnotator(value)
-

Sets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
-
Parameters
-
-
valuebool

Whether Annotator should be evaluated lazily in a -RecursivePipeline

-
-
-
-
-
- -
-
-setOutputCol(value)
-

Sets output column name of annotations.

-
-
Parameters
-
-
valuestr

Name of output column

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-transform(dataset, params=None)
-

Transforms the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset

-
-
paramsdict, optional

an optional param map that overrides embedded params.

-
-
-
-
Returns
-
-
pyspark.sql.DataFrame

transformed dataset

-
-
-
-
-
- -
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

-
- -
\ No newline at end of file
diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.CommonResolverParams.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.CommonResolverParams.html
deleted file mode 100644
index d69609af75..0000000000
--- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.CommonResolverParams.html
+++ /dev/null
@@ -1,1016 +0,0 @@
-sparknlp_jsl.annotator.CommonResolverParams — Spark NLP 3.3.0 documentation

sparknlp_jsl.annotator.CommonResolverParams

-
-
-class sparknlp_jsl.annotator.CommonResolverParams[source]
-

Bases: sparknlp.common.HasCaseSensitiveProperties

-

Class used to provide a common interface for the Entity Resolver family.

-
-
Parameters
-
-
distanceFunction

What distance function to use for WMD: ‘EUCLIDEAN’ or ‘COSINE’.

-
-
neighbours

Number of neighbours to consider in the KNN query to calculate WMD

-
-
alternatives

Number of results to return in the metadata after sorting by last distance calculated

-
-
extramassPenalty

Penalty for extra words in the knowledge base match

-
-
threshold

Threshold value for the last distance calculated.

-
-
enableWmd

Whether or not to use WMD token distance.

-
-
enableTfidf

Whether or not to use TFIDF token distance.

-
-
enableJaccard

Whether or not to use Jaccard token distance.

-
-
enableSorensenDice

Whether or not to use Sorensen-Dice token distance.

-
-
enableJaroWinkler

Whether or not to use Jaro-Winkler character distance.

-
-
enableLevenshtein

Whether or not to use Levenshtein character distance.

-
-
distanceWeights

Distance weights to apply before pooling: [WMD, TFIDF, Jaccard, SorensenDice, JaroWinkler, Levenshtein].

-
-
poolingStrategy

Pooling strategy to aggregate distances: AVERAGE, MIN or MAX.

-
-
confidenceFunction

What function to use to calculate confidence: INVERSE or SOFTMAX.

-
-
allDistancesMetadata

Whether or not to return all distance values in the metadata. Default: False.

-
-
missAsEmpty

Whether or not to return an empty annotation on unmatched chunks.

-
-
-
-
-
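A minimal sketch of how these shared parameters are typically configured. Here resolver stands in for any annotator in the Entity Resolver family that implements this interface (the concrete class is not shown on this page), and the values are illustrative:

>>> resolver = resolver \
...     .setDistanceFunction("COSINE") \
...     .setNeighbours(500) \
...     .setAlternatives(5) \
...     .setDistanceWeights([1.0, 1.0, 1.0, 1.0, 1.0, 1.0]) \
...     .setPoolingStrategy("AVERAGE") \
...     .setConfidenceFunction("SOFTMAX") \
...     .setMissAsEmpty(True)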

Methods

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

__init__(*args, **kwargs)

getCaseSensitive()

Gets whether to ignore case in tokens for embeddings matching.

setAllDistancesMetadata(s)

Sets whether or not to return an all distance values in the metadata.

setAlternatives(a)

Sets number of results to return in the metadata after sorting by last distance calculated.

setCaseSensitive(value)

Sets whether to ignore case in tokens for embeddings matching.

setConfidenceFunction(s)

What function to use to calculate confidence: INVERSE or SOFTMAX.

setDistanceFunction(dist)

Sets distance function to use for WMD: 'EUCLIDEAN' or 'COSINE'.

setDistanceWeights(l)

Sets distance weights to apply before pooling: [WMD, TFIDF, Jaccard, SorensenDice, JaroWinkler, Levenshtein].

setEnableJaccard(e)

Sets whether or not to use Jaccard token distance.

setEnableJaroWinkler(e)

Whether or not to use Jaro-Winkler token distance.

setEnableLevenshtein(e)

Sets whether or not to use Levenshtein token distance.

setEnableSorensenDice(e)

Sets whether or not to use Sorensen-Dice token distance.

setEnableTfidf(e)

Sets whether or not to use TFIDF token distance.

setEnableWmd(e)

Sets whether or not to use WMD token distance.

setExtramassPenalty(emp)

Sets penalty for extra words in the knowledge base match.

setMissAsEmpty(value)

Sets whether or not to return an empty annotation on unmatched chunks

setNeighbours(k)

Sets number of neighbours to consider in the KNN query to calculate WMD.

setPoolingStrategy(s)

Sets pooling strategy to aggregate distances: AVERAGE, MIN or MAX.

setThreshold(thres)

Sets Threshold value for the last distance calculated.

-

Attributes

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

allDistancesMetadata

alternatives

caseSensitive

confidenceFunction

distanceFunction

distanceWeights

enableJaccard

enableJaroWinkler

enableLevenshtein

enableSorensenDice

enableTfidf

enableWmd

extramassPenalty

missAsEmpty

neighbours

poolingStrategy

threshold

-
-
-getCaseSensitive()
-

Gets whether to ignore case in tokens for embeddings matching.

-
-
Returns
-
-
bool

Whether to ignore case in tokens for embeddings matching

-
-
-
-
-
- -
-
-setAllDistancesMetadata(s)[source]
-

Sets whether or not to return all distance values in the metadata. Default: False.

-
-
Parameters
-
-
s : bool

Whether or not to return all distance values in the metadata. Default: False.

-
-
-
-
-
- -
-
-setAlternatives(a)[source]
-

Sets number of results to return in the metadata after sorting by last distance calculated.

-
-
Parameters
-
-
a : int

Number of results to return in the metadata after sorting by last distance calculated.

-
-
-
-
-
- -
-
-setCaseSensitive(value)
-

Sets whether to ignore case in tokens for embeddings matching.

-
-
Parameters
-
-
valuebool

Whether to ignore case in tokens for embeddings matching

-
-
-
-
-
- -
-
-setConfidenceFunction(s)[source]
-

Sets what function to use to calculate confidence: INVERSE or SOFTMAX.

-
-
Parameters
-
-
s : str

What function to use to calculate confidence: INVERSE or SOFTMAX.

-
-
-
-
-
- -
-
-setDistanceFunction(dist)[source]
-

Sets distance function to use for WMD: ‘EUCLIDEAN’ or ‘COSINE’.

-
-
Parameters
-
-
dist : str

Value that selects what distance function to use for WMD: ‘EUCLIDEAN’ or ‘COSINE’.

-
-
-
-
-
- -
-
-setDistanceWeights(l)[source]
-

Sets distance weights to apply before pooling: [WMD, TFIDF, Jaccard, SorensenDice, JaroWinkler, Levenshtein].

-
-
Parameters
-
-
l : list

Distance weights to apply before pooling: [WMD, TFIDF, Jaccard, SorensenDice, JaroWinkler, Levenshtein].

-
-
-
-
-
- -
-
-setEnableJaccard(e)[source]
-

Sets whether or not to use Jaccard token distance.

-
-
Parameters
-
-
ebool
-
Whether or not to use Jaccard token distance.
-
-
-
-
- -
-
-setEnableJaroWinkler(e)[source]
-

Sets whether or not to use Jaro-Winkler token distance.

-
-
Parameters
-
-
e : bool

Whether or not to use Jaro-Winkler token distance.

-
-
-
-
-
- -
-
-setEnableLevenshtein(e)[source]
-

Sets whether or not to use Levenshtein token distance.

-
-
Parameters
-
-
ebool

Whether or not to use Levenshtein token distance.

-
-
-
-
-
- -
-
-setEnableSorensenDice(e)[source]
-

Sets whether or not to use Sorensen-Dice token distance.

-
-
Parameters
-
-
ebool

Whether or not to use Sorensen-Dice token distance.

-
-
-
-
-
- -
-
-setEnableTfidf(e)[source]
-

Sets whether or not to use TFIDF token distance.

-
-
Parameters
-
-
e : bool

Whether or not to use TFIDF token distance.

-
-
-
-
-
- -
-
-setEnableWmd(e)[source]
-

Sets whether or not to use WMD token distance.

-
-
Parameters
-
-
ebool

Whether or not to use WMD token distance.

-
-
-
-
-
- -
-
-setExtramassPenalty(emp)[source]
-

Sets penalty for extra words in the knowledge base match.

-
-
Parameters
-
-
emp : float

Penalty for extra words in the knowledge base match.

-
-
-
-
-
- -
-
-setMissAsEmpty(value)[source]
-

Sets whether or not to return an empty annotation on unmatched chunks

-
-
Parameters
-
-
value : bool

whether or not to return an empty annotation on unmatched chunks

-
-
-
-
-
- -
-
-setNeighbours(k)[source]
-

Sets number of neighbours to consider in the KNN query to calculate WMD.

-
-
Parameters
-
-
k : int

Number of neighbours to consider in the KNN query to calculate WMD.

-
-
-
-
-
- -
-
-setPoolingStrategy(s)[source]
-

Sets pooling strategy to aggregate distances: AVERAGE, MIN or MAX.

-
-
Parameters
-
-
s : str

Pooling strategy to aggregate distances: AVERAGE, MIN or MAX.

-
-
-
-
-
- -
-
-setThreshold(thres)[source]
-

Sets Threshold value for the last distance calculated.

-
-
Parameters
-
-
thres : float

Threshold value for the last distance calculated.

-
-
-
-
-
- -
\ No newline at end of file
diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.ContextualParserApproach.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.ContextualParserApproach.html
deleted file mode 100644
index c29840aa6f..0000000000
--- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.ContextualParserApproach.html
+++ /dev/null
@@ -1,1179 +0,0 @@
-sparknlp_jsl.annotator.ContextualParserApproach — Spark NLP 3.3.0 documentation

sparknlp_jsl.annotator.ContextualParserApproach

-
-
-class sparknlp_jsl.annotator.ContextualParserApproach[source]
-

Bases: sparknlp.common.AnnotatorApproach

-

Creates a model that extracts entities from a document based on user-defined rules. Rule matching is based on a RegexMatcher defined in a JSON file, which is set through the parameter setJsonPath(). In this JSON file, the regex you want to match is defined along with the information that will be output in the metadata field. Additionally, a dictionary can be provided with setDictionary to map extracted entities to a unified representation. The first column of the dictionary file should be the representation, with the following columns listing the possible matches.

- ---- - - - - - - - - - - -

Input Annotation types

Output Annotation type

DOCUMENT, TOKEN

CHUNK

-
-
Parameters
-
-
jsonPath

Path to json file with rules

-
-
caseSensitive

Whether to use case sensitive when matching values

-
-
prefixAndSuffixMatch

Whether to match both prefix and suffix to annotate the hit

-
-
dictionary

Path to dictionary file in tsv or csv format

-
-
-
-
-

Examples

-
>>> import sparknlp
->>> from sparknlp.base import *
->>> from sparknlp.common import *
->>> from sparknlp.annotator import *
->>> from sparknlp.training import *
->>> import sparknlp_jsl
->>> from sparknlp_jsl.base import *
->>> from sparknlp_jsl.annotator import *
->>> from pyspark.ml import Pipeline
-
-
-
>>> documentAssembler = DocumentAssembler() \
...     .setInputCol("text") \
...     .setOutputCol("document")
>>> sentenceDetector = SentenceDetector() \
...     .setInputCols(["document"]) \
...     .setOutputCol("sentence")
>>> tokenizer = Tokenizer() \
...     .setInputCols(["sentence"]) \
...     .setOutputCol("token")
-
-
-

Define the parser (json file needs to be provided)

-
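As a sketch of what such a rules file might contain, the snippet below writes a single rule; the field names and the regex are illustrative assumptions and are not taken from this page:

>>> import json
>>> stage_rule = {
...     "entity": "Stage",          # value reported in the metadata "field"
...     "ruleScope": "sentence",    # match within each sentence
...     "regex": "[cpyrau]?[T][0-9X?][a-z^cp]*",  # illustrative TNM-stage pattern
...     "matchScope": "token"
... }
>>> with open("/path/to/regex_token.json", "w") as f:
...     json.dump(stage_rule, f)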
>>> data = spark.createDataFrame([["A patient has liver metastases pT1bN0M0 and the T5 primary site may be colon or... "]]).toDF("text")
>>> contextualParser = ContextualParserApproach() \
...     .setInputCols(["sentence", "token"]) \
...     .setOutputCol("entity") \
...     .setJsonPath("/path/to/regex_token.json") \
...     .setCaseSensitive(True)
>>> pipeline = Pipeline(stages=[
...     documentAssembler,
...     sentenceDetector,
...     tokenizer,
...     contextualParser
... ])
-
-
-
>>> result = pipeline.fit(data).transform(data)
>>> result.selectExpr("explode(entity)").show(5, truncate=False)
-
-
- --- - - - - - - -

col

{chunk, 32, 39, pT1bN0M0, {field -> Stage, normalized -> , confidenceValue -> 1.00, sentence -> 0}, []}
{chunk, 49, 50, T5, {field -> Stage, normalized -> , confidenceValue -> 1.00, sentence -> 0}, []}
{chunk, 148, 156, cT4bcN2M1, {field -> Stage, normalized -> , confidenceValue -> 1.00, sentence -> 1}, []}
{chunk, 189, 194, T?N3M1, {field -> Stage, normalized -> , confidenceValue -> 1.00, sentence -> 2}, []}
{chunk, 316, 323, pT1bN0M0, {field -> Stage, normalized -> , confidenceValue -> 1.00, sentence -> 3}, []}

-

Methods

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

__init__()

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

fit(dataset[, params])

Fits a model to the input dataset with optional parameters.

fitMultiple(dataset, paramMaps)

Fits a model to the input dataset for each param map in paramMaps.

getInputCols()

Gets current column names of input annotations.

getLazyAnnotator()

Gets whether Annotator should be evaluated lazily in a RecursivePipeline.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getOutputCol()

Gets output column name of annotations.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

set(param, value)

Sets a parameter in the embedded param map.

setCaseSensitive(value)

Sets whether to use case sensitive when matching values

setDictionary(path[, read_as, options])

Sets the dictionary file used to map extracted entities to a unified representation.

setInputCols(*value)

Sets column names of input annotations.

setJsonPath(value)

Sets path to json file with rules

setLazyAnnotator(value)

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

setOutputCol(value)

Sets output column name of annotations.

setParamValue(paramName)

Sets the value of a parameter.

setPrefixAndSuffixMatch(value)

Sets whether to match both prefix and suffix to annotate the hit

write()

Returns an MLWriter instance for this ML instance.

-

Attributes

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

caseSensitive

dictionary

getter_attrs

inputCols

jsonPath

lazyAnnotator

outputCol

params

Returns all params ordered by name.

prefixAndSuffixMatch

-
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some -extra params. This implementation first calls Params.copy and -then make a copy of the companion Java pipeline component with -extra params. So both the Python wrapper and the Java pipeline -component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
-
-fit(dataset, params=None)
-

Fits a model to the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset.

-
-
paramsdict or list or tuple, optional

an optional param map that overrides embedded params. If a list/tuple of -param maps is given, this calls fit on each param map and returns a list of -models.

-
-
-
-
Returns
-
-
Transformer or a list of Transformer

fitted model(s)

-
-
-
-
-
- -
-
-fitMultiple(dataset, paramMaps)
-

Fits a model to the input dataset for each param map in paramMaps.

-
-

New in version 2.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset.

-
-
paramMapscollections.abc.Sequence

A Sequence of param maps.

-
-
-
-
Returns
-
-
_FitMultipleIterator

A thread safe iterable which contains one model for each param map. Each -call to next(modelIterator) will return (index, model) where model was fit -using paramMaps[index]. index values may not be sequential.

-
-
-
-
-
- -
-
-getInputCols()
-

Gets current column names of input annotations.

-
- -
-
-getLazyAnnotator()
-

Gets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getOutputCol()
-

Gets output column name of annotations.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-property params
-

Returns all params ordered by name. The default implementation -uses dir() to get all attributes of type -Param.

-
- -
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setCaseSensitive(value)[source]
-

Sets whether to use case sensitive when matching values

-
-
Parameters
-
-
value : bool

Whether to use case sensitive when matching values

-
-
-
-
-
- -
-
-setDictionary(path, read_as='TEXT', options={'delimiter': '\t'})[source]
-

Sets the dictionary file used to map extracted entities to a unified representation.

-
-
Parameters
-
-
path : str

Path to the dictionary file

-
-
read_as: ReadAs

Format of the file

-
-
options: dict

Dictionary with the options to read the file.

-
-
-
-
-
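A sketch of calling this setter; the path and delimiter are illustrative. Per the class description, the first column of the dictionary file is the unified representation and the remaining columns are the possible matches:

>>> contextualParser = contextualParser.setDictionary(
...     "/path/to/dictionary.tsv", options={"delimiter": "\t"})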
- -
-
-setInputCols(*value)
-

Sets column names of input annotations.

-
-
Parameters
-
-
*valuestr

Input columns for the annotator

-
-
-
-
-
- -
-
-setJsonPath(value)[source]
-

Sets path to json file with rules

-
-
Parameters
-
-
value : str

Path to json file with rules

-
-
-
-
-
- -
-
-setLazyAnnotator(value)
-

Sets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
-
Parameters
-
-
valuebool

Whether Annotator should be evaluated lazily in a -RecursivePipeline

-
-
-
-
-
- -
-
-setOutputCol(value)
-

Sets output column name of annotations.

-
-
Parameters
-
-
valuestr

Name of output column

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-setPrefixAndSuffixMatch(value)[source]
-

Sets whether to match both prefix and suffix to annotate the hit

-
-
Parameters
-
-
value : bool

Whether to match both prefix and suffix to annotate the hit

-
-
-
-
-
- -
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

-
- -
\ No newline at end of file
diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.ContextualParserModel.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.ContextualParserModel.html
deleted file mode 100644
index ad40825adc..0000000000
--- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.ContextualParserModel.html
+++ /dev/null
@@ -1,1122 +0,0 @@
-sparknlp_jsl.annotator.ContextualParserModel — Spark NLP 3.3.0 documentation

sparknlp_jsl.annotator.ContextualParserModel

-
-
-class sparknlp_jsl.annotator.ContextualParserModel(classname='com.johnsnowlabs.nlp.annotators.context.ContextualParserModel', java_model=None)[source]
-

Bases: sparknlp.common.AnnotatorModel

-

Extracts entities from a document based on user-defined rules. Rule matching is based on a RegexMatcher defined in a JSON file. In this file, the regex you want to match is defined along with the information that will be output in the metadata field.

- ---- - - - - - - - - - - -

Input Annotation types

Output Annotation type

DOCUMENT, TOKEN

CHUNK

-
-
Parameters
-
-
jsonPath

Path to json file with rules

-
-
caseSensitive

Whether to use case sensitive when matching values

-
-
prefixAndSuffixMatch

Whether to match both prefix and suffix to annotate the hit

-
-
-
-
-

Examples

-
>>> import sparknlp
->>> from sparknlp.base import *
->>> from sparknlp.common import *
->>> from sparknlp.annotator import *
->>> from sparknlp.training import *
->>> import sparknlp_jsl
->>> from sparknlp_jsl.base import *
->>> from sparknlp_jsl.annotator import *
->>> from pyspark.ml import Pipeline
-
-
-

This means the stage code is extracted at the sentence level. An example pipeline could then be defined like this:

-
>>> documentAssembler = DocumentAssembler() \
...     .setInputCol("text") \
...     .setOutputCol("document")
>>> sentenceDetector = SentenceDetector() \
...     .setInputCols(["document"]) \
...     .setOutputCol("sentence")
>>> tokenizer = Tokenizer() \
...     .setInputCols(["sentence"]) \
...     .setOutputCol("token")
-
-
-
>>> data = spark.createDataFrame([["A patient has liver metastases pT1bN0M0 and the T5 primary site may be colon or... "]]).toDF("text")
>>> contextualParser = ContextualParserModel.load("mycontextualParserModel") \
...     .setInputCols(["sentence", "token"]) \
...     .setOutputCol("entity")
>>> pipeline = Pipeline(stages=[
...     documentAssembler,
...     sentenceDetector,
...     tokenizer,
...     contextualParser
... ])
-
-
-
>>> result = pipeline.fit(data).transform(data)
>>> result.selectExpr("explode(entity)").show(5, truncate=False)
-
-
- --- - - - - - - -

col

{chunk, 32, 39, pT1bN0M0, {field -> Stage, normalized -> , confidenceValue -> 1.00, sentence -> 0}, []}
{chunk, 49, 50, T5, {field -> Stage, normalized -> , confidenceValue -> 1.00, sentence -> 0}, []}
{chunk, 148, 156, cT4bcN2M1, {field -> Stage, normalized -> , confidenceValue -> 1.00, sentence -> 1}, []}
{chunk, 189, 194, T?N3M1, {field -> Stage, normalized -> , confidenceValue -> 1.00, sentence -> 2}, []}
{chunk, 316, 323, pT1bN0M0, {field -> Stage, normalized -> , confidenceValue -> 1.00, sentence -> 3}, []}

-

Methods

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

__init__([classname, java_model])

Initialize this instance with a Java model object.

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

getInputCols()

Gets current column names of input annotations.

getLazyAnnotator()

Gets whether Annotator should be evaluated lazily in a RecursivePipeline.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getOutputCol()

Gets output column name of annotations.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

set(param, value)

Sets a parameter in the embedded param map.

setCaseSensitive(value)

Sets whether to use case sensitive when matching values

setInputCols(*value)

Sets column names of input annotations.

setJsonPath(value)

Sets path to json file with rules

setLazyAnnotator(value)

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

setOutputCol(value)

Sets output column name of annotations.

setParamValue(paramName)

Sets the value of a parameter.

setParams()

setPrefixAndSuffixMatch(value)

Sets whether to match both prefix and suffix to annotate the hit

transform(dataset[, params])

Transforms the input dataset with optional parameters.

write()

Returns an MLWriter instance for this ML instance.

-

Attributes

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - -

caseSensitive

getter_attrs

inputCols

jsonPath

lazyAnnotator

outputCol

params

Returns all params ordered by name.

prefixAndSuffixMatch

-
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some -extra params. This implementation first calls Params.copy and -then make a copy of the companion Java pipeline component with -extra params. So both the Python wrapper and the Java pipeline -component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
-
-getInputCols()
-

Gets current column names of input annotations.

-
- -
-
-getLazyAnnotator()
-

Gets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getOutputCol()
-

Gets output column name of annotations.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-property params
-

Returns all params ordered by name. The default implementation -uses dir() to get all attributes of type -Param.

-
- -
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setCaseSensitive(value)[source]
-

Sets whether to use case sensitive when matching values

-
-
Parameters
-
-
value : bool

Whether to use case sensitive when matching values

-
-
-
-
-
- -
-
-setInputCols(*value)
-

Sets column names of input annotations.

-
-
Parameters
-
-
*valuestr

Input columns for the annotator

-
-
-
-
-
- -
-
-setJsonPath(value)[source]
-

Sets path to json file with rules

-
-
Parameters
-
-
value : str

Path to json file with rules

-
-
-
-
-
- -
-
-setLazyAnnotator(value)
-

Sets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
-
Parameters
-
-
valuebool

Whether Annotator should be evaluated lazily in a -RecursivePipeline

-
-
-
-
-
- -
-
-setOutputCol(value)
-

Sets output column name of annotations.

-
-
Parameters
-
-
valuestr

Name of output column

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-setPrefixAndSuffixMatch(value)[source]
-

Sets whether to match both prefix and suffix to annotate the hit

-
-
Parameters
-
-
value : bool

Whether to match both prefix and suffix to annotate the hit

-
-
-
-
-
- -
-
-transform(dataset, params=None)
-

Transforms the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset

-
-
paramsdict, optional

an optional param map that overrides embedded params.

-
-
-
-
Returns
-
-
pyspark.sql.DataFrame

transformed dataset

-
-
-
-
-
- -
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

-
- -
\ No newline at end of file
diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.DateNormalizer.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.DateNormalizer.html
deleted file mode 100644
index 28a0cbbd1c..0000000000
--- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.DateNormalizer.html
+++ /dev/null
@@ -1,1115 +0,0 @@
-sparknlp_jsl.annotator.DateNormalizer — Spark NLP 3.3.0 documentation

sparknlp_jsl.annotator.DateNormalizer

-
-
-class sparknlp_jsl.annotator.DateNormalizer(classname='com.johnsnowlabs.nlp.annotators.normalizer.DateNormalizer', java_model=None)[source]
-

Bases: sparknlp.common.AnnotatorModel

-

Tries to normalize dates in chunk annotations. The expected format for the date is YYYY/MM/DD. If the date is normalized, the normalized field in the metadata will be true; otherwise it will be false.

- ---- - - - - - - - - - - -

Input Annotation types

Output Annotation type

CHUNK

CHUNK

-
-
Parameters
-
-
anchorDateYear

Add an anchor year for the relative dates such as a day after tomorrow. -If not set it will use the current year. Example: 2021

-
-
anchorDateMonth

Add an anchor month for the relative dates such as a day after tomorrow. -If not set it will use the current month. Example: 1 which means January

-
-
anchorDateDay

Add an anchor day of the day for the relative dates such as a day after -tomorrow. If not set it will use the current day. Example: 11

-
-
-
-
-
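Before the full pipeline example below, a minimal sketch of just the anchor-date configuration, reusing the example values from the parameter descriptions above:

>>> dateNormalizer = DateNormalizer() \
...     .setInputCols("chunk_date") \
...     .setOutputCol("date") \
...     .setAnchorDateYear(2021) \
...     .setAnchorDateMonth(1) \
...     .setAnchorDateDay(11)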

Examples

-
>>> import sparknlp
->>> from sparknlp.base import *
->>> from sparknlp.common import *
->>> from sparknlp.annotator import *
->>> from sparknlp.training import *
->>> import sparknlp_jsl
->>> from sparknlp_jsl.base import *
->>> from sparknlp_jsl.annotator import *
->>> from pyspark.ml import Pipeline
>>> document_assembler = DocumentAssembler().setInputCol('ner_chunk').setOutputCol('document')
>>> chunksDF = document_assembler.transform(df)
>>> aa = map_annotations_col(chunksDF.select("document"),
...     lambda x: [Annotation('chunk', a.begin, a.end, a.result, a.metadata, a.embeddings) for a in x], "document",
...     "chunk_date", "chunk")
>>> dateNormalizer = DateNormalizer().setInputCols('chunk_date').setOutputCol('date').setAnchorDateYear(2000).setAnchorDateMonth(3).setAnchorDateDay(15)
>>> result = dateNormalizer.transform(aa)
>>> data = spark.createDataFrame([["Fri, 21 Nov 1997"], ["next week at 7.30"], ["see you a day after"]]).toDF("text")
>>> result = pipeline.fit(data).transform(data)
>>> result.selectExpr("date.result", "text")
-+-------------+-----------+
-|       result|       text|
-+-------------+-----------+
-| [08/02/2018]| 08/02/2018|
-|    [11/2018]|    11/2018|
-| [11/01/2018]| 11/01/2018|
-|[next monday]|next monday|
-|      [today]|      today|
-|  [next week]|  next week|
-+-------------+-----------+
-
-
-

Methods

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

__init__([classname, java_model])

Initialize this instance with a Java model object.

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

getInputCols()

Gets current column names of input annotations.

getLazyAnnotator()

Gets whether Annotator should be evaluated lazily in a RecursivePipeline.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getOutputCol()

Gets output column name of annotations.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

set(param, value)

Sets a parameter in the embedded param map.

setAnchorDateDay(value)

Sets an anchor day of the day for the relative dates such as a day after tomorrow.

setAnchorDateMonth(value)

Sets an anchor month for the relative dates such as a day after tomorrow.

setAnchorDateYear(value)

Sets an anchor year for the relative dates such as a day after tomorrow.

setInputCols(*value)

Sets column names of input annotations.

setLazyAnnotator(value)

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

setOutputCol(value)

Sets output column name of annotations.

setParamValue(paramName)

Sets the value of a parameter.

setParams()

transform(dataset[, params])

Transforms the input dataset with optional parameters.

write()

Returns an MLWriter instance for this ML instance.

-

Attributes

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

anchorDateDay

anchorDateMonth

anchorDateYear

getter_attrs

inputCols

lazyAnnotator

name

outputCol

params

Returns all params ordered by name.

-
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some -extra params. This implementation first calls Params.copy and -then make a copy of the companion Java pipeline component with -extra params. So both the Python wrapper and the Java pipeline -component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
-
-getInputCols()
-

Gets current column names of input annotations.

-
- -
-
-getLazyAnnotator()
-

Gets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getOutputCol()
-

Gets output column name of annotations.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-property params
-

Returns all params ordered by name. The default implementation -uses dir() to get all attributes of type -Param.

-
- -
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setAnchorDateDay(value)[source]
-

Sets the anchor day (day of the month) for relative dates such as "a day after tomorrow". If not set, the current day is used.

-

Example: 11

-
-
Parameters
-
-
valueint

The anchor day for relative dates

-
-
-
-
-
- -
-
-setAnchorDateMonth(value)[source]
-

Sets the anchor month for relative dates such as "a day after tomorrow". If not set, the current month is used.

-

Example: 1 which means January

-
-
Parameters
-
-
valueint

The anchor month for relative dates

-
-
-
-
-
- -
-
-setAnchorDateYear(value)[source]
-

Sets the anchor year for relative dates such as "a day after tomorrow". If not set, the current year is used.

-

Example: 2021

-
-
Parameters
-
-
valueint

The anchor year for relative dates

-
-
-
-
-
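A minimal sketch of how the three anchor-date setters combine, using the example values above (11, 1, 2021). The variable date_annotator is a hypothetical, already-constructed instance of the annotator documented on this page; only the setters listed above are assumed.

>>> # Anchor relative dates such as "a day after tomorrow" to 2021-01-11.
>>> date_annotator = date_annotator \
...     .setAnchorDateYear(2021) \
...     .setAnchorDateMonth(1) \
...     .setAnchorDateDay(11)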
- -
-
-setInputCols(*value)
-

Sets column names of input annotations.

-
-
Parameters
-
-
*valuestr

Input columns for the annotator

-
-
-
-
-
- -
-
-setLazyAnnotator(value)
-

Sets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
-
Parameters
-
-
valuebool

Whether Annotator should be evaluated lazily in a -RecursivePipeline

-
-
-
-
-
- -
-
-setOutputCol(value)
-

Sets output column name of annotations.

-
-
Parameters
-
-
valuestr

Name of output column

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-transform(dataset, params=None)
-

Transforms the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset

-
-
paramsdict, optional

an optional param map that overrides embedded params.

-
-
-
-
Returns
-
-
pyspark.sql.DataFrame

transformed dataset

-
-
-
-
-
- -
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

-
- -
- -
- - -
- - - - -
- - -
-
- - - -
-
- - - - - -
-
- - \ No newline at end of file diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.DeIdentification.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.DeIdentification.html deleted file mode 100644 index adcfe9f79a..0000000000 --- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.DeIdentification.html +++ /dev/null @@ -1,1653 +0,0 @@ - - - - - - - - sparknlp_jsl.annotator.DeIdentification — Spark NLP 3.3.0 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl.annotator.DeIdentification

-
-
-class sparknlp_jsl.annotator.DeIdentification[source]
-

Bases: sparknlp.common.AnnotatorApproach

-

Contains all the methods for training a DeIdentificationModel. This module can obfuscate or mask the entities that contain personal information. These can be set with a file of regex patterns with setRegexPatternsDictionary, where each line is a mapping of entity to regex.
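The layout of that file is not reproduced on this page; the hypothetical contents below assume the plain-text, space-delimited "ENTITY regex" format implied by the default options of setRegexPatternsDictionary (delimiter: ' '). The entity names and patterns are illustrative only.

DATE \d{2}/\d{2}/\d{2}
AGE \d{1,3} years-old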

- ---- - - - - - - - - - - -

Input Annotation types

Output Annotation type

DOCUMENT, CHUNK, TOKEN

DOCUMENT

-
-
Parameters
-
-
regexPatternsDictionary

Dictionary with regular expression patterns that match some protected entity.

-
-
mode

Mode for the anonymizer ['mask'|'obfuscate'].

-
-
obfuscateDate

Whether to obfuscate dates when mode=='obfuscate'. This param helps keep dateFormats consistent and visible. When setting it to true, make sure the dateFormats param fits the needs (default: false).

-
-
obfuscateRefFile

File with the terms to be used for Obfuscation

-
-
refFileFormat

Format of the reference file

-
-
refSep

Sep character in refFile

-
-
dateTag

Tag representing dates in the obfuscate reference file (default: DATE)

-
-
days

Number of days by which to displace the dates during obfuscation. If not provided, a random integer between 1 and 60 will be used.

-
-
dateToYear

True if we want the model to transform dates into years, False otherwise.

-
-
minYear

Minimum year to be used when transforming dates into years.

-
-
dateFormats

List of date formats to automatically displace if parsed

-
-
consistentObfuscation

Whether to replace very similar entities in a document with the same randomized term (default: true). The similarity is based on the Levenshtein distance between the words.

-
-
sameEntityThreshold

Similarity threshold [0.0-1.0] to consider two appearances of an entity as the same (default: 0.9).

-
-
obfuscateRefSource

The source of the terms used to obfuscate the entities (this does not apply to date entities). The values are the following: file: takes the entities from the obfuscateRefFile; faker: takes the entities from the Faker module; both: takes the entities randomly from the obfuscateRefFile and the Faker module.

-
-
regexOverride

If true, the regex entities take priority; if false, the NER entities take priority.

-
-
seed

The seed used to select the entities in obfuscate mode. With the seed you can replay an execution several times with the same output.

-
-
ignoreRegex

Whether to ignore the regex file loaded in the model. If true, the default regex file will not be used. The default value is false.

-
-
isRandomDateDisplacement

Whether to use a random number of displacement days for date entities; the random number is based on [[DeIdentificationParams.seed]]. If true, a random displacement is used; if false, [[DeIdentificationParams.days]] is used. The default value is false.

-
-
mappingsColumn

The mapping column that will return the annotation chunks with the fake entities.

-
-
returnEntityMappings

Whether to return the mappings column.

-
-
blackList

List of entities ignored for masking or obfuscation. The default values are: "SSN", "PASSPORT", "DLN", "NPI", "C_CARD", "IBAN", "DEA".

-
-
maskingPolicy
-
Select the masking policy:

same_length_chars: Replace the obfuscated entity with a masking sequence of asterisks surrounded by square brackets, with the total length of the masking sequence equal to the length of the original sequence. Example: Smith -> [***]. If the entity is shorter than 3 chars (like "Jo", or "5"), asterisks without brackets are returned. entity_labels: Replace the values with the corresponding entity labels. fixed_length_chars: Replace the obfuscated entity with a masking sequence composed of a fixed number of asterisks. A plain-Python sketch of what each policy produces appears just before the Examples section below.

-
-
-
-
-
-
-
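A plain-Python illustration (standard expressions only, not the library API) of what each masking policy would produce for the entity "Smith" labelled NAME; the fixedMaskLength value of 4 is an assumed setting.

>>> entity, label = "Smith", "NAME"
>>> "[" + "*" * (len(entity) - 2) + "]"   # same_length_chars: total length equals len(entity)
'[***]'
>>> f"<{label}>"                          # entity_labels
'<NAME>'
>>> "*" * 4                               # fixed_length_chars with fixedMaskLength == 4
'****'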

Examples

-
>>> import sparknlp
->>> from sparknlp.base import *
->>> from sparknlp.common import *
->>> from sparknlp.annotator import *
->>> from sparknlp.training import *
->>> import sparknlp_jsl
->>> from sparknlp_jsl.base import *
->>> from sparknlp_jsl.annotator import *
->>> from pyspark.ml import Pipeline
->>> documentAssembler = DocumentAssembler() \
-...     .setInputCol("text") \
-...     .setOutputCol("document")
-...
->>>  sentenceDetector = SentenceDetector() \
-...     .setInputCols(["document"]) \
-...     .setOutputCol("sentence") \
-...     .setUseAbbreviations(True)
-...
->>> tokenizer = Tokenizer() \
-...     .setInputCols(["sentence"]) \
-...     .setOutputCol("token")
-...
->>> embeddings = WordEmbeddingsModel \
-...     .pretrained("embeddings_clinical", "en", "clinical/models") \
-...     .setInputCols(["sentence", "token"]) \
-...     .setOutputCol("embeddings")
-...
- Ner entities
->>> clinical_sensitive_entities = MedicalNerModel \
-...     .pretrained("ner_deid_enriched", "en", "clinical/models") \
-...     .setInputCols(["sentence", "token", "embeddings"]).setOutputCol("ner")
-...
->>> nerConverter = NerConverter() \
-...     .setInputCols(["sentence", "token", "ner"]) \
-...     .setOutputCol("ner_con")
- Deidentification
- dic_regex_patterns_main_categories.txt: file with custom regex patterns for custom entities
- obfuscate_fixed_entities.txt: file with custom obfuscator names for the entities
->>> deIdentification = DeIdentification() \
-...     .setInputCols(["ner_chunk", "token", "sentence"]) \
-...     .setOutputCol("dei") \
-...     .setRegexPatternsDictionary("path/to/dic_regex_patterns_main_categories.txt") \
-...     .setObfuscateRefFile("path/to/obfuscate_fixed_entities.txt") \
-...     .setRefFileFormat("csv") \
-...     .setRefSep("#") \
-...     .setMode("obfuscate") \
-...     .setDateFormats(["MM/dd/yy", "yyyy-MM-dd"]) \
-...     .setObfuscateDate(True) \
-...     .setDateTag("DATE") \
-...     .setDays(5) \
-...     .setObfuscateRefSource("file")
-Pipeline
->>> data = spark.createDataFrame([
-...     ["# 7194334 Date : 01/13/93 PCP : Oliveira , 25 years-old , Record date : 2079-11-09."]
-...     ]).toDF("text")
->>> pipeline = Pipeline(stages=[
-...     documentAssembler,
-...     sentenceDetector,
-...     tokenizer,
-...     embeddings,
-...     clinical_sensitive_entities,
-...     nerConverter,
-...     deIdentification
-... ])
->>> result = pipeline.fit(data).transform(data)
->>> result.select("dei.result").show(truncate = False)
- +--------------------------------------------------------------------------------------------------+
- |result                                                                                            |
- +--------------------------------------------------------------------------------------------------+
- |[# 01010101 Date : 01/18/93 PCP : Dr. Gregory House , <AGE> years-old , Record date : 2079-11-14.]|
- +--------------------------------------------------------------------------------------------------+
-
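A hypothetical variation of the example above that masks the entities instead of obfuscating them; only setters documented on this page are used, and the resulting column contents depend on the chosen masking policy.

>>> deIdentificationMask = DeIdentification() \
...     .setInputCols(["ner_chunk", "token", "sentence"]) \
...     .setOutputCol("dei") \
...     .setMode("mask") \
...     .setMaskingPolicy("entity_labels")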
-
-

Methods

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

__init__()

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

fit(dataset[, params])

Fits a model to the input dataset with optional parameters.

fitMultiple(dataset, paramMaps)

Fits a model to the input dataset for each param map in paramMaps.

getBlackList()

getInputCols()

Gets current column names of input annotations.

getLazyAnnotator()

Gets whether Annotator should be evaluated lazily in a RecursivePipeline.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getOutputCol()

Gets output column name of annotations.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

set(param, value)

Sets a parameter in the embedded param map.

setBlackList(s)

Sets the list of entities ignored for masking or obfuscation. The default values are: "SSN", "PASSPORT", "DLN", "NPI", "C_CARD", "IBAN", "DEA".

setConsistentObfuscation(s)

Sets whether to replace very similar entities in a document with the same randomized term (default: true).The similarity is based on the Levenshtein Distance between the words.

setDateFormats(s)

Sets list of date formats to automatically displace if parsed

setDateTag(t)

Sets tag representing dates in the obfuscate reference file (default: DATE)

setDateToYear(s)

Sets transform dates into years.

setDays(d)

Sets number of days to obfuscate by displacement the dates.

setFixedMaskLength(length)

Sets the fixed mask length: the length of the masking sequence used when the 'fixed_length_chars' masking policy is selected.

setIgnoreRegex(s)

Sets if you want to use regex.

setInputCols(*value)

Sets column names of input annotations.

setIsRandomDateDisplacement(s)

Sets if you want to use random displacement in dates

setLanguage(l)

Sets the language used to select the regex file and some faker entities: 'en' (English), 'de' (German) or 'es' (Spanish).

setLazyAnnotator(value)

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

setMappingsColumn(s)

Sets the name of mapping column that will return the Annotations chunks with the fake entities

setMaskingPolicy(m)

Sets the masking policy:

setMinYear(s)

Sets minimum year to be used when transforming dates into years.

setMode(m)

Sets the mode for the anonymizer ['mask'|'obfuscate'].

setObfuscateDate(value)

Sets whether to obfuscate dates when mode=='obfuscate'.

setObfuscateRefFile(f)

Sets the file with the terms to be used for obfuscation.

setObfuscateRefSource(s)

Sets the source used for obfuscation ['both'|'faker'|'file'].

setOutputCol(value)

Sets output column name of annotations.

setParamValue(paramName)

Sets the value of a parameter.

setRefFileFormat(f)

Sets format of the reference file

setRefSep(c)

Sets separator character in refFile

setRegexOverride(s)

Sets whether to prioritize regex over ner entities

setRegexPatternsDictionary(path[, read_as, ...])

Sets the dictionary with regular expression patterns that match some protected entity.

setReturnEntityMappings(s)

Sets if you want to return mapping column

setSameEntityThreshold(s)

Sets similarity threshold [0.0-1.0] to consider two appearances of an entity as the same (default: 0.9).

setSeed(s)

Sets the seed to select the entities on obfuscate mode

write()

Returns an MLWriter instance for this ML instance.

-

Attributes

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

blackList

consistentObfuscation

dateFormats

dateTag

dateToYear

days

fixedMaskLength

getter_attrs

ignoreRegex

inputCols

isRandomDateDisplacement

language

lazyAnnotator

mappingsColumn

maskingPolicy

minYear

mode

name

obfuscateDate

obfuscateRefFile

obfuscateRefSource

outputCol

params

Returns all params ordered by name.

refFileFormat

refSep

regexOverride

regexPatternsDictionary

returnEntityMappings

sameEntityThreshold

seed

-
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some -extra params. This implementation first calls Params.copy and -then make a copy of the companion Java pipeline component with -extra params. So both the Python wrapper and the Java pipeline -component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
-
-fit(dataset, params=None)
-

Fits a model to the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset.

-
-
paramsdict or list or tuple, optional

an optional param map that overrides embedded params. If a list/tuple of -param maps is given, this calls fit on each param map and returns a list of -models.

-
-
-
-
Returns
-
-
Transformer or a list of Transformer

fitted model(s)

-
-
-
-
-
- -
-
-fitMultiple(dataset, paramMaps)
-

Fits a model to the input dataset for each param map in paramMaps.

-
-

New in version 2.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset.

-
-
paramMapscollections.abc.Sequence

A Sequence of param maps.

-
-
-
-
Returns
-
-
_FitMultipleIterator

A thread safe iterable which contains one model for each param map. Each -call to next(modelIterator) will return (index, model) where model was fit -using paramMaps[index]. index values may not be sequential.

-
-
-
-
-
- -
-
-getInputCols()
-

Gets current column names of input annotations.

-
- -
-
-getLazyAnnotator()
-

Gets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getOutputCol()
-

Gets output column name of annotations.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-property params
-

Returns all params ordered by name. The default implementation -uses dir() to get all attributes of type -Param.

-
- -
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setBlackList(s)[source]
-

Sets the list of entities ignored for masking or obfuscation.

-
-
Parameters
-
-
slist

List of entities to ignore. The default values are: "SSN", "PASSPORT", "DLN", "NPI", "C_CARD", "IBAN", "DEA".

-
-
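A minimal usage sketch, reusing the deIdentification instance from the Examples section above; the chosen entity labels are illustrative.

>>> # Leave SSN and PASSPORT chunks untouched (ignored for masking/obfuscation).
>>> deIdentification = deIdentification.setBlackList(["SSN", "PASSPORT"])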
- -
-
-setConsistentObfuscation(s)[source]
-

Sets whether to replace very similar entities in a document with the same randomized term (default: true).The similarity is based on the Levenshtein Distance between the words.

-
-
Parameters
-
-
sstr

Whether to replace very similar entities in a document with the same randomized term .The similarity is based on the Levenshtein Distance between the words.

-
-
-
-
-
- -
-
-setDateFormats(s)[source]
-

Sets list of date formats to automatically displace if parsed

-
-
Parameters
-
-
namestr

List of date formats to automatically displace if parsed

-
-
-
-
-
- -
-
-setDateTag(t)[source]
-

Sets tag representing dates in the obfuscate reference file (default: DATE)

-
-
Parameters
-
-
fstr

Tag representing dates in the obfuscate reference file (default: DATE)

-
-
-
-
-
- -
-
-setDateToYear(s)[source]
-

Sets transform dates into years.

-
-
Parameters
-
-
sbool

True if we want the model to transform dates into years, False otherwise.

-
-
-
-
-
- -
-
-setDays(d)[source]
-

Sets the number of days by which to displace the dates during obfuscation.

-
-
Parameters
-
-
dint

Number of days by which to displace the dates during obfuscation.

-
-
-
-
-
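A plain-Python illustration (standard library only, not the annotator itself) of what a 5-day displacement means for the date in the Examples section: 01/13/93 becomes 01/18/93.

>>> from datetime import datetime, timedelta
>>> datetime.strptime("01/13/93", "%m/%d/%y") + timedelta(days=5)
datetime.datetime(1993, 1, 18, 0, 0)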
- -
-
-setFixedMaskLength(length)[source]
-
-
Fixed mask length: this is the length of the masking sequence that will be used when the 'fixed_length_chars' masking policy is selected.

-
-
-
-
lengthint

The mask length

-
-
-
- -
-
-setIgnoreRegex(s)[source]
-

Sets if you want to use regex.

-
-
Parameters
-
-
sbool

Whether to use regex.

-
-
-
-
-
- -
-
-setInputCols(*value)
-

Sets column names of input annotations.

-
-
Parameters
-
-
*valuestr

Input columns for the annotator

-
-
-
-
-
- -
-
-setIsRandomDateDisplacement(s)[source]
-

Sets if you want to use random displacement in dates

-
-
Parameters
-
-
sbool

Boolean value to select if you want to use random displacement in dates

-
-
-
-
-
- -
-
-setLanguage(l)[source]
-

Sets the language used to select the regex file and some faker entities: 'en' (English), 'de' (German) or 'es' (Spanish).

-
-
Parameters
-
-
lstr

The language used to select the regex file and some faker entities: 'en' (English), 'de' (German) or 'es' (Spanish).

-
-
-
-
-
- -
-
-setLazyAnnotator(value)
-

Sets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
-
Parameters
-
-
valuebool

Whether Annotator should be evaluated lazily in a -RecursivePipeline

-
-
-
-
-
- -
-
-setMappingsColumn(s)[source]
-

Sets the name of mapping column that will return the Annotations chunks with the fake entities

-
-
Parameters
-
-
namestr

Mapping column that will return the Annotations chunks with the fake entities

-
-
-
-
-
- -
-
-setMaskingPolicy(m)[source]
-
-
Sets the masking policy:

same_length_chars: Replace the obfuscated entity with a masking sequence of asterisks surrounded by square brackets, with the total length of the masking sequence equal to the length of the original sequence. Example: Smith -> [***]. If the entity is shorter than 3 chars (like "Jo", or "5"), asterisks without brackets are returned. entity_labels: Replace the values with the corresponding entity labels. fixed_length_chars: Replace the obfuscated entity with a masking sequence composed of a fixed number of asterisks.

-
-
-
-
Parameters
-
-
mstr

The masking policy

-
-
-
-
-
- -
-
-setMinYear(s)[source]
-

Sets minimum year to be used when transforming dates into years.

-
-
Parameters
-
-
sint

Minimum year to be used when transforming dates into years.

-
-
-
-
-
- -
-
-setMode(m)[source]
-

Sets the mode for the anonymizer ['mask'|'obfuscate'].

-
-
Parameters
-
-
mstr

Mode for the anonymizer ['mask'|'obfuscate'].

-
-
-
-
-
- -
-
-setObfuscateDate(value)[source]
-

Sets whether to obfuscate dates when mode=='obfuscate'.

-
-
Parameters
-
-
valuestr

Whether to obfuscate dates when mode=='obfuscate'. This param helps keep dateFormats consistent and visible. When setting it to true, make sure the dateFormats param fits the needs (default: false). When set to false, the date will be masked as <DATE>.

-
-
-
-
-
- -
-
-setObfuscateRefFile(f)[source]
-

Sets the file with the terms to be used for obfuscation.

-
-
Parameters
-
-
fstr

File with the terms to be used for obfuscation.

-
-
- -
-
-setObfuscateRefSource(s)[source]
-

Sets the source used for obfuscation ['both'|'faker'|'file'].

-
-
Parameters
-
-
sstr

The source used for obfuscation ['both'|'faker'|'file'].

-
-
-
-
-
- -
-
-setOutputCol(value)
-

Sets output column name of annotations.

-
-
Parameters
-
-
valuestr

Name of output column

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-setRefFileFormat(f)[source]
-

Sets format of the reference file

-
-
Parameters
-
-
fstr

Format of the reference file

-
-
-
-
-
- -
-
-setRefSep(c)[source]
-

Sets separator character in refFile

-
-
Parameters
-
-
fstr

Separator character in refFile

-
-
-
-
-
- -
-
-setRegexOverride(s)[source]
-

Sets whether to prioritize regex over ner entities

-
-
Parameters
-
-
sbool

Whether to prioritize regex over ner entities

-
-
-
-
-
- -
-
-setRegexPatternsDictionary(path, read_as='TEXT', options={'delimiter': ' '})[source]
-

Sets the dictionary with regular expression patterns that match some protected entity.

-
-
Parameters
-
-
pathstr

Path where the dictionary is located.

-
-
read_as: ReadAs

Format of the file

-
-
options: dict

Dictionary with the options to read the file.

-
-
-
-
-
- -
-
-setReturnEntityMappings(s)[source]
-

Sets if you want to return mapping column

-
-
Parameters
-
-
sbool

Whether to return the mappings column.

-
-
-
-
-
- -
-
-setSameEntityThreshold(s)[source]
-

Sets similarity threshold [0.0-1.0] to consider two appearances of an entity as the same (default: 0.9).

-
-
Parameters
-
-
sfloat

Similarity threshold [0.0-1.0] to consider two appearances of an entity as the same (default: 0.9).

-
-
-
-
-
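The exact similarity normalization is not documented on this page; the sketch below assumes a common length-normalized Levenshtein similarity, purely to illustrate what a 0.9 threshold means.

>>> def levenshtein(a, b):
...     prev = list(range(len(b) + 1))
...     for i, ca in enumerate(a, 1):
...         cur = [i]
...         for j, cb in enumerate(b, 1):
...             cur.append(min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + (ca != cb)))
...         prev = cur
...     return prev[-1]
...
>>> a, b = "Gregory House", "Gregory Hause"
>>> round(1 - levenshtein(a, b) / max(len(a), len(b)), 3)   # >= 0.9, so treated as the same entity
0.923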
- -
-
-setSeed(s)[source]
-

Sets the seed to select the entities on obfuscate mode

-
-
Parameters
-
-
sint

The seed to select the entities on obfuscate mode

-
-
-
-
-
- -
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

-
- -
- -
- - -
- - - - -
- - -
-
- - - -
-
- - - - - -
-
- - \ No newline at end of file diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.DeIdentificationModel.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.DeIdentificationModel.html deleted file mode 100644 index 785b63e915..0000000000 --- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.DeIdentificationModel.html +++ /dev/null @@ -1,1543 +0,0 @@ - - - - - - - - sparknlp_jsl.annotator.DeIdentificationModel — Spark NLP 3.3.0 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl.annotator.DeIdentificationModel

-
-
-class sparknlp_jsl.annotator.DeIdentificationModel(classname='com.johnsnowlabs.nlp.annotators.deid.DeIdentificationModel', java_model=None)[source]
-

Bases: sparknlp.common.AnnotatorModel

-

The DeIdentificationModel can obfuscate or mask the entities that contain personal information. These can be set with a file of regex patterns with setRegexPatternsDictionary, where each line is a mapping of entity to regex.

- ---- - - - - - - - - - - -

Input Annotation types

Output Annotation type

DOCUMENT, CHUNK, TOKEN

DOCUMENT

-
-
Parameters
-
-
regexPatternsDictionary

Dictionary with regular expression patterns that match some protected entity.

-
-
mode

Mode for the anonymizer ['mask'|'obfuscate'].

-
-
obfuscateDate

Whether to obfuscate dates when mode=='obfuscate'. This param helps keep dateFormats consistent and visible. When setting it to true, make sure the dateFormats param fits the needs (default: false).

-
-
dateTag

Tag representing dates in the obfuscate reference file (default: DATE)

-
-
days

Number of days by which to displace the dates during obfuscation. If not provided, a random integer between 1 and 60 will be used.

-
-
dateToYear

True if we want the model to transform dates into years, False otherwise.

-
-
minYear

Minimum year to be used when transforming dates into years.

-
-
dateFormats

List of date formats to automatically displace if parsed

-
-
consistentObfuscation

Whether to replace very similar entities in a document with the same randomized term (default: true). The similarity is based on the Levenshtein distance between the words.

-
-
sameEntityThreshold

Similarity threshold [0.0-1.0] to consider two appearances of an entity as the same (default: 0.9).

-
-
obfuscateRefSource

The source of the terms used to obfuscate the entities (this does not apply to date entities). The values are the following: file: takes the entities from the obfuscateRefFile; faker: takes the entities from the Faker module; both: takes the entities randomly from the obfuscateRefFile and the Faker module.

-
-
regexOverride

If true, the regex entities take priority; if false, the NER entities take priority.

-
-
seed

The seed used to select the entities in obfuscate mode. With the seed you can replay an execution several times with the same output.

-
-
ignoreRegex

Whether to ignore the regex file loaded in the model. If true, the default regex file will not be used. The default value is false.

-
-
isRandomDateDisplacement

Whether to use a random number of displacement days for date entities; the random number is based on [[DeIdentificationParams.seed]]. If true, a random displacement is used; if false, [[DeIdentificationParams.days]] is used. The default value is false.

-
-
mappingsColumn

The mapping column that will return the annotation chunks with the fake entities.

-
-
returnEntityMappings

Whether to return the mappings column.

-
-
blackList

List of entities ignored for masking or obfuscation. The default values are: "SSN", "PASSPORT", "DLN", "NPI", "C_CARD", "IBAN", "DEA".

-
-
regexEntities

Keep the regex entities used in the regexPatternDictionary

-
-
maskingPolicy
-
Select the masking policy:

same_length_chars: Replace the obfuscated entity with a masking sequence of asterisks surrounded by square brackets, with the total length of the masking sequence equal to the length of the original sequence. Example: Smith -> [***]. If the entity is shorter than 3 chars (like "Jo", or "5"), asterisks without brackets are returned. entity_labels: Replace the values with the corresponding entity labels. fixed_length_chars: Replace the obfuscated entity with a masking sequence composed of a fixed number of asterisks.

-
-
-
-
fixedMaskLength: this is the length of the masking sequence that will be used when the ‘fixed_length_chars’ masking policy is selected.
-
-
-
-

Examples

-
>>> import sparknlp
->>> from sparknlp.base import *
->>> from sparknlp.common import *
->>> from sparknlp.annotator import *
->>> from sparknlp.training import *
->>> import sparknlp_jsl
->>> from sparknlp_jsl.base import *
->>> from sparknlp_jsl.annotator import *
->>> from pyspark.ml import Pipeline
->>> documentAssembler = DocumentAssembler() \
-...     .setInputCol("text") \
-...     .setOutputCol("document")
-...
->>>  sentenceDetector = SentenceDetector() \
-...     .setInputCols(["document"]) \
-...     .setOutputCol("sentence") \
-...     .setUseAbbreviations(True)
-...
->>> tokenizer = Tokenizer() \
-...     .setInputCols(["sentence"]) \
-...     .setOutputCol("token")
-...
->>> embeddings = WordEmbeddingsModel \
-...     .pretrained("embeddings_clinical", "en", "clinical/models") \
-...     .setInputCols(["sentence", "token"]) \
-...     .setOutputCol("embeddings")
-...
- Ner entities
->>> clinical_sensitive_entities = MedicalNerModel \
-...     .pretrained("ner_deid_enriched", "en", "clinical/models") \
-...     .setInputCols(["sentence", "token", "embeddings"]).setOutputCol("ner")
-...
->>> nerConverter = NerConverter() \
-...     .setInputCols(["sentence", "token", "ner"]) \
-...     .setOutputCol("ner_con")
-...
- Deidentification
->>> deIdentification = DeIdentificationModel.pretrained("deidentify_large", "en", "clinical/models") \
-...     .setInputCols(["ner_chunk", "token", "sentence"]) \
-...     .setOutputCol("dei") \
-...     .setMode("obfuscate") \
-...     .setDateFormats(["MM/dd/yy", "yyyy-MM-dd"]) \
-...     .setObfuscateDate(True) \
-...     .setDateTag("DATE") \
-...     .setDays(5) \
-...     .setObfuscateRefSource("both")
->>> data = spark.createDataFrame([
-...     ["# 7194334 Date : 01/13/93 PCP : Oliveira , 25 years-old , Record date : 2079-11-09."]
-...     ]).toDF("text")
->>> pipeline = Pipeline(stages=[
-...     documentAssembler,
-...     sentenceDetector,
-...     tokenizer,
-...     embeddings,
-...     clinical_sensitive_entities,
-...     nerConverter,
-...     deIdentification
-... ])
->>> result = pipeline.fit(data).transform(data)
->>> result.select("dei.result").show(truncate = False)
- +--------------------------------------------------------------------------------------------------+
- |result                                                                                            |
- +--------------------------------------------------------------------------------------------------+
- |[# 01010101 Date : 01/18/93 PCP : Dr. Gregory House , <AGE> years-old , Record date : 2079-11-14.]|
- +--------------------------------------------------------------------------------------------------+
-
-
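A hypothetical extension of the example above that also returns the entity mappings; setReturnEntityMappings and setMappingsColumn are documented on this page, but the exact schema of the mappings column is not shown here.

>>> deIdentification = deIdentification \
...     .setReturnEntityMappings(True) \
...     .setMappingsColumn("mappings")
>>> result = pipeline.fit(data).transform(data)
>>> result.select("mappings").show(truncate=False)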
-

Methods

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

__init__([classname, java_model])

Initialize this instance with a Java model object.

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

getBlackList()

getInputCols()

Gets current column names of input annotations.

getLazyAnnotator()

Gets whether Annotator should be evaluated lazily in a RecursivePipeline.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getOutputCol()

Gets output column name of annotations.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

getRegexEntities()

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

pretrained(name[, lang, remote_loc])

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

set(param, value)

Sets a parameter in the embedded param map.

setBlackList(s)

Sets the list of entities ignored for masking or obfuscation. The default values are: "SSN", "PASSPORT", "DLN", "NPI", "C_CARD", "IBAN", "DEA".

setConsistentObfuscation(s)

Sets whether to replace very similar entities in a document with the same randomized term (default: true).The similarity is based on the Levenshtein Distance between the words.

setDateFormats(s)

Sets list of date formats to automatically displace if parsed

setDateTag(t)

Sets the tag representing dates in the obfuscate reference file (default: DATE).

setDateToYear(s)

Sets transform dates into years.

setDays(d)

Sets number of days to obfuscate by displacement the dates.

setFixedMaskLength(length)

Sets the fixed mask length: the length of the masking sequence used when the 'fixed_length_chars' masking policy is selected.

setIgnoreRegex(s)

Sets if you want to use regex.

setInputCols(*value)

Sets column names of input annotations.

setIsRandomDateDisplacement(s)

Sets if you want to use random displacement in dates

setLanguage(l)

Sets the language used to select the regex file and some faker entities: 'en' (English), 'de' (German) or 'es' (Spanish).

setLazyAnnotator(value)

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

setMappingsColumn(s)

Sets the name of mapping column that will return the Annotations chunks with the fake entities

setMaskingPolicy(m)

Sets the masking policy:

setMinYear(s)

Sets minimum year to be used when transforming dates into years.

setMode(m)

Sets the mode for the anonymizer ['mask'|'obfuscate'].

setObfuscateDate(value)

Sets whether to obfuscate dates when mode=='obfuscate'.

setObfuscateRefSource(s)

Sets the source used for obfuscation ['both'|'faker'|'file'].

setOutputCol(value)

Sets output column name of annotations.

setParamValue(paramName)

Sets the value of a parameter.

setParams()

setRegexOverride(s)

Sets whether to prioritize regex over ner entities

setReturnEntityMappings(s)

Sets if you want to return mapping column

setSameEntityThreshold(s)

Sets similarity threshold [0.0-1.0] to consider two appearances of an entity as the same (default: 0.9).

setSeed(s)

Sets the seed to select the entities on obfuscate mode

transform(dataset[, params])

Transforms the input dataset with optional parameters.

write()

Returns an MLWriter instance for this ML instance.

-

Attributes

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

blackList

consistentObfuscation

dateFormats

dateTag

dateToYear

days

fixedMaskLength

getter_attrs

ignoreRegex

inputCols

isRandomDateDisplacement

language

lazyAnnotator

mappingsColumn

maskingPolicy

minYear

mode

name

obfuscateDate

obfuscateRefSource

outputCol

params

Returns all params ordered by name.

regexEntities

regexOverride

returnEntityMappings

sameEntityThreshold

seed

-
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some -extra params. This implementation first calls Params.copy and -then make a copy of the companion Java pipeline component with -extra params. So both the Python wrapper and the Java pipeline -component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
-
-getInputCols()
-

Gets current column names of input annotations.

-
- -
-
-getLazyAnnotator()
-

Gets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getOutputCol()
-

Gets output column name of annotations.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-property params
-

Returns all params ordered by name. The default implementation -uses dir() to get all attributes of type -Param.

-
- -
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setBlackList(s)[source]
-

Sets the list of entities ignored for masking or obfuscation.

-
-
Parameters
-
-
slist

List of entities to ignore. The default values are: "SSN", "PASSPORT", "DLN", "NPI", "C_CARD", "IBAN", "DEA".

-
-
- -
-
-setConsistentObfuscation(s)[source]
-

Sets whether to replace very similar entities in a document with the same randomized term (default: true).The similarity is based on the Levenshtein Distance between the words.

-
-
Parameters
-
-
sstr

Whether to replace very similar entities in a document with the same randomized term (default: true).The similarity is based on the Levenshtein Distance between the words.

-
-
-
-
-
- -
-
-setDateFormats(s)[source]
-

Sets list of date formats to automatically displace if parsed

-
-
Parameters
-
-
sstr

List of date formats to automatically displace if parsed

-
-
-
-
-
- -
-
-setDateTag(t)[source]
-

Sets the tag representing dates in the obfuscate reference file (default: DATE).

-
-
Parameters
-
-
fstr

Tag representing dates in the obfuscate reference file (default: DATE).

-
-
-
-
-
- -
-
-setDateToYear(s)[source]
-

Sets transform dates into years.

-
-
Parameters
-
-
sbool

True if we want the model to transform dates into years, False otherwise.

-
-
-
-
-
- -
-
-setDays(d)[source]
-

Sets number of days to obfuscate by displacement the dates.

-
-
Parameters
-
-
dint

Number of days to obfuscate by displacement the dates.

-
-
-
-
-
- -
-
-setFixedMaskLength(length)[source]
-
-
Fixed mask length: this is the length of the masking sequence that will be used when the 'fixed_length_chars' masking policy is selected.

-
-
-
-
lengthint

The mask length

-
-
-
- -
-
-setIgnoreRegex(s)[source]
-

Sets if you want to use regex.

-
-
Parameters
-
-
sbool

Whether to use regex.

-
-
-
-
-
- -
-
-setInputCols(*value)
-

Sets column names of input annotations.

-
-
Parameters
-
-
*valuestr

Input columns for the annotator

-
-
-
-
-
- -
-
-setIsRandomDateDisplacement(s)[source]
-

Sets if you want to use random displacement in dates

-
-
Parameters
-
-
sbool

Boolean value to select if you want to use random displacement in dates

-
-
-
-
-
- -
-
-setLanguage(l)[source]
-

Sets the language used to select the regex file and some faker entities: 'en' (English), 'de' (German) or 'es' (Spanish).

-
-
Parameters
-
-
lstr

The language used to select the regex file and some faker entities: 'en' (English), 'de' (German) or 'es' (Spanish).

-
-
-
-
-
- -
-
-setLazyAnnotator(value)
-

Sets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
-
Parameters
-
-
valuebool

Whether Annotator should be evaluated lazily in a -RecursivePipeline

-
-
-
-
-
- -
-
-setMappingsColumn(s)[source]
-

Sets the name of mapping column that will return the Annotations chunks with the fake entities

-
-
Parameters
-
-
namestr

Mapping column that will return the Annotations chunks with the fake entities

-
-
-
-
-
- -
-
-setMaskingPolicy(m)[source]
-
-
Sets the masking policy:

same_length_chars: Replace the obfuscated entity with a masking sequence of asterisks surrounded by square brackets, with the total length of the masking sequence equal to the length of the original sequence. Example: Smith -> [***]. If the entity is shorter than 3 chars (like "Jo", or "5"), asterisks without brackets are returned. entity_labels: Replace the values with the corresponding entity labels. fixed_length_chars: Replace the obfuscated entity with a masking sequence composed of a fixed number of asterisks.

-
-
-
-
Parameters
-
-
mstr

The masking policy

-
-
-
-
-
- -
-
-setMinYear(s)[source]
-

Sets minimum year to be used when transforming dates into years.

-
-
Parameters
-
-
sint

Minimum year to be used when transforming dates into years.

-
-
-
-
-
- -
-
-setMode(m)[source]
-

Sets the mode for the anonymizer ['mask'|'obfuscate'].

-
-
Parameters
-
-
mstr

Mode for the anonymizer ['mask'|'obfuscate'].

-
-
-
-
-
- -
-
-setObfuscateDate(value)[source]
-

Sets whether to obfuscate dates when mode=='obfuscate'.

-
-
Parameters
-
-
valuestr

Whether to obfuscate dates when mode=='obfuscate'. This param helps keep dateFormats consistent and visible. When setting it to true, make sure the dateFormats param fits the needs (default: false). When set to false, the date will be masked as <DATE>.

-
-
-
-
-
- -
-
-setObfuscateRefSource(s)[source]
-

Sets the source used for obfuscation ['both'|'faker'|'file'].

-
-
Parameters
-
-
sstr

The source used for obfuscation ['both'|'faker'|'file'].

-
-
-
-
-
- -
-
-setOutputCol(value)
-

Sets output column name of annotations.

-
-
Parameters
-
-
valuestr

Name of output column

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-setRegexOverride(s)[source]
-

Sets whether to prioritize regex over ner entities

-
-
Parameters
-
-
sbool

Whether to prioritize regex over ner entities

-
-
-
-
-
- -
-
-setReturnEntityMappings(s)[source]
-

Sets if you want to return mapping column

-
-
Parameters
-
-
sbool

Whether to save the mappings column.

-
-
-
-
-
- -
-
-setSameEntityThreshold(s)[source]
-

Sets similarity threshold [0.0-1.0] to consider two appearances of an entity as the same (default: 0.9).

-
-
Parameters
-
-
sfloat

Similarity threshold [0.0-1.0] to consider two appearances of an entity as the same (default: 0.9).

-
-
-
-
-
- -
-
-setSeed(s)[source]
-

Sets the seed to select the entities on obfuscate mode

-
-
Parameters
-
-
sint

The seed to select the entities on obfuscate mode

-
-
-
-
-
- -
-
-transform(dataset, params=None)
-

Transforms the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset

-
-
paramsdict, optional

an optional param map that overrides embedded params.

-
-
-
-
Returns
-
-
pyspark.sql.DataFrame

transformed dataset

-
-
-
-
-
- -
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

-
- -
- -
- - -
- - - - -
- - -
-
- - - -
-
- - - - - -
-
- - \ No newline at end of file diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.DocumentLogRegClassifierApproach.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.DocumentLogRegClassifierApproach.html deleted file mode 100644 index 0d47a35c70..0000000000 --- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.DocumentLogRegClassifierApproach.html +++ /dev/null @@ -1,1232 +0,0 @@ - - - - - - - - sparknlp_jsl.annotator.DocumentLogRegClassifierApproach — Spark NLP 3.3.0 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl.annotator.DocumentLogRegClassifierApproach

-
-
-class sparknlp_jsl.annotator.DocumentLogRegClassifierApproach[source]
-

Bases: sparknlp.common.AnnotatorApproach

-

Trains a model to classify documents with a logistic regression algorithm. Training data requires columns for the text and its label. The result is a trained DocumentLogRegClassifierModel.

- ---- - - - - - - - - - - -

Input Annotation types

Output Annotation type

TOKEN

CATEGORY

-
-
Parameters
-
-
labelCol

Column with the value result we are trying to predict.

-
-
maxIter

maximum number of iterations.

-
-
tol

convergence tolerance after each iteration.

-
-
fitIntercept

whether to fit an intercept term, default is true.

-
-
labels

array to output the label in the original form.

-
-
vectorizationModelPath

specify the vectorization model if it has been already trained.

-
-
classificationModelPath

specify the classification model if it has been already trained.

-
-
-
-
-

Examples

-
>>> import sparknlp
->>> from sparknlp.base import *
->>> from sparknlp.common import *
->>> from sparknlp.annotator import *
->>> from sparknlp.training import *
->>> import sparknlp_jsl
->>> from sparknlp_jsl.base import *
->>> from sparknlp_jsl.annotator import *
->>> from pyspark.ml import Pipeline
-
-
-

An example pipeline could then be defined like this

-
>>> tokenizer = Tokenizer() \
-...    .setInputCols("document") \
-...    .setOutputCol("token")
-...
->>> normalizer = Normalizer() \
-...    .setInputCols("token") \
-...    .setOutputCol("normalized")
-...
->>> stopwords_cleaner = StopWordsCleaner()\
-...    .setInputCols("normalized")\
-...    .setOutputCol("cleanTokens")\
-...    .setCaseSensitive(False)
-...
->>> stemmer = Stemmer() \
-...    .setInputCols("cleanTokens") \
-...    .setOutputCol("stem")
-...
->>> gen_clf = DocumentLogRegClassifierApproach() \
-...    .setLabelColumn("category") \
-...    .setInputCols("stem") \
-...    .setOutputCol("prediction")
-...
->>> pipeline = Pipeline().setStages([
-...    document_assembler,
-...    tokenizer,
-...    normalizer,
-...    stopwords_cleaner,
-...    stemmer,
-...    gen_clf
-...])
-...
->>> clf_model = pipeline.fit(data)
-
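A hedged follow-up to the example above: clf_model and the column names come from that pipeline, and reading the predicted labels through "prediction.result" assumes the usual annotation layout of the output column.

>>> result = clf_model.transform(data)
>>> result.select("category", "prediction.result").show(truncate=False)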
-
-

Methods

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

__init__()

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

fit(dataset[, params])

Fits a model to the input dataset with optional parameters.

fitMultiple(dataset, paramMaps)

Fits a model to the input dataset for each param map in paramMaps.

getInputCols()

Gets current column names of input annotations.

getLazyAnnotator()

Gets whether Annotator should be evaluated lazily in a RecursivePipeline.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getOutputCol()

Gets output column name of annotations.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

set(param, value)

Sets a parameter in the embedded param map.

setClassificationModelPath(value)

Sets a path to the classification model if it has already been trained.

setFitIntercept(merge)

Sets whether to fit an intercept term, default is true.

setInputCols(*value)

Sets column names of input annotations.

setLabelColumn(label)

Sets column with the value result we are trying to predict.

setLabels(value)

Sets array to output the label in the original form.

setLazyAnnotator(value)

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

setMaxIter(k)

Sets maximum number of iterations.

setOutputCol(value)

Sets output column name of annotations.

setParamValue(paramName)

Sets the value of a parameter.

setTol(dist)

Sets convergence tolerance after each iteration.

setVectorizationModelPath(value)

Sets a path to the vectorization model if it has already been trained.

write()

Returns an MLWriter instance for this ML instance.

-

Attributes

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

classificationModelPath

fitIntercept

getter_attrs

inputCols

labelCol

labels

lazyAnnotator

maxIter

outputCol

params

Returns all params ordered by name.

tol

vectorizationModelPath

-
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some -extra params. This implementation first calls Params.copy and -then make a copy of the companion Java pipeline component with -extra params. So both the Python wrapper and the Java pipeline -component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
-
-fit(dataset, params=None)
-

Fits a model to the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset.

-
-
paramsdict or list or tuple, optional

an optional param map that overrides embedded params. If a list/tuple of -param maps is given, this calls fit on each param map and returns a list of -models.

-
-
-
-
Returns
-
-
Transformer or a list of Transformer

fitted model(s)

-
-
-
-
-
- -
-
-fitMultiple(dataset, paramMaps)
-

Fits a model to the input dataset for each param map in paramMaps.

-
-

New in version 2.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset.

-
-
paramMapscollections.abc.Sequence

A Sequence of param maps.

-
-
-
-
Returns
-
-
_FitMultipleIterator

A thread safe iterable which contains one model for each param map. Each -call to next(modelIterator) will return (index, model) where model was fit -using paramMaps[index]. index values may not be sequential.

-
-
-
-
-
- -
-
-getInputCols()
-

Gets current column names of input annotations.

-
- -
-
-getLazyAnnotator()
-

Gets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getOutputCol()
-

Gets output column name of annotations.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-property params
-

Returns all params ordered by name. The default implementation -uses dir() to get all attributes of type -Param.

-
- -
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setClassificationModelPath(value)[source]
-

Sets a path to the classification model if it has been already trained.

-
-
Parameters
-
-
valuestr

Path to the classification model if it has been already trained.

-
-
-
-
-
- -
-
-setFitIntercept(merge)[source]
-

Sets whether to fit an intercept term, default is true.

-
-
Parameters
-
-
mergebool

Whether to fit an intercept term, default is true.

-
-
-
-
-
- -
-
-setInputCols(*value)
-

Sets column names of input annotations.

-
-
Parameters
-
-
*valuestr

Input columns for the annotator

-
-
-
-
-
- -
-
-setLabelColumn(label)[source]
-

Sets column with the value result we are trying to predict.

-
-
Parameters
-
-
labelstr

Column with the value result we are trying to predict.

-
-
-
-
-
- -
-
-setLabels(value)[source]
-

Sets array to output the label in the original form.

-
-
Parameters
-
-
labellist

array to output the label in the original form.

-
-
-
-
-
- -
-
-setLazyAnnotator(value)
-

Sets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
-
Parameters
-
-
valuebool

Whether Annotator should be evaluated lazily in a -RecursivePipeline

-
-
-
-
-
- -
-
-setMaxIter(k)[source]
-

Sets maximum number of iterations.

-
-
Parameters
-
-
kint

Maximum number of iterations.

-
-
-
-
-
- -
-
-setOutputCol(value)
-

Sets output column name of annotations.

-
-
Parameters
-
-
valuestr

Name of output column

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-setTol(dist)[source]
-

Sets convergence tolerance after each iteration.

-
-
Parameters
-
-
distfloat

Convergence tolerance after each iteration.

-
-
-
-
-
- -
-
-setVectorizationModelPath(value)[source]
-

Sets a path to the vectorization model if it has been already trained.

-
-
Parameters
-
-
valuestr

Path to the vectorization model if it has been already trained.

-
-
-
-
-
- -
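A minimal training sketch for DocumentLogRegClassifierApproach using the setters documented above (the DataFrame `data`, its "text" and "label" columns, and the chosen values are illustrative assumptions, not part of the original page):

>>> document = DocumentAssembler().setInputCol("text").setOutputCol("document")
>>> tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")
>>> logreg = DocumentLogRegClassifierApproach() \
...     .setInputCols(["token"]) \
...     .setOutputCol("category") \
...     .setLabelColumn("label") \
...     .setMaxIter(10) \
...     .setTol(1e-6) \
...     .setFitIntercept(True)
>>> pipeline = Pipeline(stages=[document, tokenizer, logreg])
>>> model = pipeline.fit(data)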
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

-
- -
- -
- - -
- - - - -
- - -
-
- - - -
-
- - - - - -
-
- - \ No newline at end of file diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.DocumentLogRegClassifierModel.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.DocumentLogRegClassifierModel.html deleted file mode 100644 index 2e88ca74be..0000000000 --- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.DocumentLogRegClassifierModel.html +++ /dev/null @@ -1,1097 +0,0 @@ - - - - - - - - sparknlp_jsl.annotator.DocumentLogRegClassifierModel — Spark NLP 3.3.0 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl.annotator.DocumentLogRegClassifierModel

-
-
-class sparknlp_jsl.annotator.DocumentLogRegClassifierModel(classname='com.johnsnowlabs.nlp.annotators.classification.DocumentLogRegClassifierModel', java_model=None)[source]
-

Bases: sparknlp.common.AnnotatorModel

-

Classifies documents with a Logistic Regression algorithm.

- ---- - - - - - - - - - - -

Input Annotation types

Output Annotation type

TOKEN

CATEGORY

-
-
Parameters
-
-
mergeChunks

Whether to merge all chunks in a document or not (Default: false)

-
-
labels

Array to output the label in the original form.

-
-
vectorizationModel

Vectorization model if it has been already trained.

-
-
classificationModel

Classification model if it has been already trained.

-
-
-
-
-
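A brief inference sketch, assuming a model trained and saved earlier with DocumentLogRegClassifierApproach (the path and column names are illustrative):

>>> document = DocumentAssembler().setInputCol("text").setOutputCol("document")
>>> tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")
>>> doc_logreg = DocumentLogRegClassifierModel.load("/tmp/doc_logreg_model") \
...     .setInputCols(["token"]) \
...     .setOutputCol("category") \
...     .setMergeChunks(True)
>>> pipeline = Pipeline(stages=[document, tokenizer, doc_logreg])
>>> result = pipeline.fit(data).transform(data)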

Methods

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

__init__([classname, java_model])

Initialize this instance with a Java model object.

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

getInputCols()

Gets current column names of input annotations.

getLazyAnnotator()

Gets whether Annotator should be evaluated lazily in a RecursivePipeline.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getOutputCol()

Gets output column name of annotations.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

pretrained(name[, lang, remote_loc])

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

set(param, value)

Sets a parameter in the embedded param map.

setClassificationModel(merge)

Sets the classification model if it has been already trained.

setInputCols(*value)

Sets column names of input annotations.

setLabels(value)

Sets array to output the label in the original form.

setLazyAnnotator(value)

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

setMergeChunks(merge)

Sets whether to merge all chunks in a document or not (Default: false)

setOutputCol(value)

Sets output column name of annotations.

setParamValue(paramName)

Sets the value of a parameter.

setParams()

setVectorizationModel(merge)

Sets the vectorization model if it has been already trained.

transform(dataset[, params])

Transforms the input dataset with optional parameters.

write()

Returns an MLWriter instance for this ML instance.

-

Attributes

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

classificationModel

getter_attrs

inputCols

labels

lazyAnnotator

mergeChunks

name

outputCol

params

Returns all params ordered by name.

vectorizationModel

-
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some -extra params. This implementation first calls Params.copy and -then make a copy of the companion Java pipeline component with -extra params. So both the Python wrapper and the Java pipeline -component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
-
-getInputCols()
-

Gets current column names of input annotations.

-
- -
-
-getLazyAnnotator()
-

Gets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getOutputCol()
-

Gets output column name of annotations.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-property params
-

Returns all params ordered by name. The default implementation -uses dir() to get all attributes of type -Param.

-
- -
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setClassificationModel(merge)[source]
-

Sets the classification model if it has been already trained.

-
-
Parameters
-
-
label: :class:`pyspark.ml.PipelineModel`

Classification model if it has been already trained.

-
-
-
-
-
- -
-
-setInputCols(*value)
-

Sets column names of input annotations.

-
-
Parameters
-
-
*valuestr

Input columns for the annotator

-
-
-
-
-
- -
-
-setLabels(value)[source]
-

Sets array to output the label in the original form.

-
-
Parameters
-
-
labellist

array to output the label in the original form.

-
-
-
-
-
- -
-
-setLazyAnnotator(value)
-

Sets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
-
Parameters
-
-
valuebool

Whether Annotator should be evaluated lazily in a -RecursivePipeline

-
-
-
-
-
- -
-
-setMergeChunks(merge)[source]
-

Sets whether to merge all chunks in a document or not (Default: false)

-
-
Parameters
-
-
labellist

whether to merge all chunks in a document or not (Default: false)

-
-
-
-
-
- -
-
-setOutputCol(value)
-

Sets output column name of annotations.

-
-
Parameters
-
-
valuestr

Name of output column

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-setVectorizationModel(merge)[source]
-

Sets the vectorization model if it has been already trained.

-
-
Parameters
-
-
label: :class:`pyspark.ml.PipelineModel`

Vectorization model if it has been already trained.

-
-
-
-
-
- -
-
-transform(dataset, params=None)
-

Transforms the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset

-
-
paramsdict, optional

an optional param map that overrides embedded params.

-
-
-
-
Returns
-
-
pyspark.sql.DataFrame

transformed dataset

-
-
-
-
-
- -
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

-
- -
- -
- - -
- - - - -
- - -
-
- - - -
-
- - - - - -
-
- - \ No newline at end of file diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.DrugNormalizer.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.DrugNormalizer.html deleted file mode 100644 index 655a68a2a4..0000000000 --- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.DrugNormalizer.html +++ /dev/null @@ -1,1072 +0,0 @@ - - - - - - - - sparknlp_jsl.annotator.DrugNormalizer — Spark NLP 3.3.0 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl.annotator.DrugNormalizer

-
-
-class sparknlp_jsl.annotator.DrugNormalizer[source]
-

Bases: sparknlp.common.AnnotatorModel

-
-
Annotator which normalizes raw text from clinical documents, e.g. scraped web pages or xml documents, from document type columns into Sentence.

Removes all dirty characters from text following one or more input regex patterns. -Can apply unwanted character removal with a specific policy. -Can apply lower case normalization.

-
-
- ---- - - - - - - - - - - -

Input Annotation types

Output Annotation type

DOCUMENT

DOCUMENT

-
-
Parameters
-
-
lowercase

whether to convert strings to lowercase

-
-
policy

Policy to remove patterns from text. Defaults to "all".

-
-
-
-
-

Examples

-
>>> data = spark.createDataFrame([
-...   ["Sodium Chloride/Potassium Chloride 13bag"],
-...   ["interferon alfa-2b 10 million unit ( 1 ml ) injec"],
-...   ["aspirin 10 meq/ 5 ml oral sol"]
-... ]).toDF("text")
->>> document = DocumentAssembler().setInputCol("text").setOutputCol("document")
->>> drugNormalizer = DrugNormalizer().setInputCols(["document"]).setOutputCol("document_normalized")
->>> trainingPipeline = Pipeline(stages=[document, drugNormalizer])
->>> result = trainingPipeline.fit(data).transform(data)
->>> result.selectExpr("explode(document_normalized.result) as normalized_text").show(truncate=False)
-+----------------------------------------------------+
-|normalized_text                                     |
-+----------------------------------------------------+
-|Sodium Chloride / Potassium Chloride 13 bag         |
-|interferon alfa - 2b 10000000 unt ( 1 ml ) injection|
-|aspirin 2 meq/ml oral solution                      |
-+----------------------------------------------------+
-
-
-

Methods

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

__init__()

Initialize this instance with a Java model object.

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

getInputCols()

Gets current column names of input annotations.

getLazyAnnotator()

Gets whether Annotator should be evaluated lazily in a RecursivePipeline.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getOutputCol()

Gets output column name of annotations.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

set(param, value)

Sets a parameter in the embedded param map.

setInputCols(*value)

Sets column names of input annotations.

setLazyAnnotator(value)

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

setLowercase(value)

Sets whether to convert strings to lowercase

setOutputCol(value)

Sets output column name of annotations.

setParamValue(paramName)

Sets the value of a parameter.

setParams()

setPolicy(value)

Sets policy to remove patterns from text.

transform(dataset[, params])

Transforms the input dataset with optional parameters.

write()

Returns an MLWriter instance for this ML instance.

-

Attributes

- ---- - - - - - - - - - - - - - - - - - - - - - - - -

getter_attrs

inputCols

lazyAnnotator

lowercase

outputCol

params

Returns all params ordered by name.

policy

-
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some -extra params. This implementation first calls Params.copy and -then make a copy of the companion Java pipeline component with -extra params. So both the Python wrapper and the Java pipeline -component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
-
-getInputCols()
-

Gets current column names of input annotations.

-
- -
-
-getLazyAnnotator()
-

Gets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getOutputCol()
-

Gets output column name of annotations.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-property params
-

Returns all params ordered by name. The default implementation -uses dir() to get all attributes of type -Param.

-
- -
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setInputCols(*value)
-

Sets column names of input annotations.

-
-
Parameters
-
-
*valuestr

Input columns for the annotator

-
-
-
-
-
- -
-
-setLazyAnnotator(value)
-

Sets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
-
Parameters
-
-
valuebool

Whether Annotator should be evaluated lazily in a -RecursivePipeline

-
-
-
-
-
- -
-
-setLowercase(value)[source]
-

Sets whether to convert strings to lowercase

-
-
Parameters
-
-
pbool

Whether to convert strings to lowercase

-
-
-
-
-
- -
-
-setOutputCol(value)
-

Sets output column name of annotations.

-
-
Parameters
-
-
valuestr

Name of output column

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-setPolicy(value)[source]
-

Sets policy to remove patterns from text.

-
-
Parameters
-
-
pstr

policy to remove patterns from text.

-
-
-
-
-
- -
-
-transform(dataset, params=None)
-

Transforms the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset

-
-
paramsdict, optional

an optional param map that overrides embedded params.

-
-
-
-
Returns
-
-
pyspark.sql.DataFrame

transformed dataset

-
-
-
-
-
- -
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

-
- -
- -
- - -
- - - - -
- - -
-
- - - -
-
- - - - - -
-
- - \ No newline at end of file diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.EntityChunkEmbeddings.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.EntityChunkEmbeddings.html deleted file mode 100644 index d3b57419a6..0000000000 --- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.EntityChunkEmbeddings.html +++ /dev/null @@ -1,1356 +0,0 @@ - - - - - - - - sparknlp_jsl.annotator.EntityChunkEmbeddings — Spark NLP 3.3.0 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl.annotator.EntityChunkEmbeddings

-
-
-class sparknlp_jsl.annotator.EntityChunkEmbeddings(classname='com.johnsnowlabs.nlp.annotators.embeddings.EntityChunkEmbeddings', java_model=None)[source]
-

Bases: sparknlp.annotator.BertSentenceEmbeddings

-
-

Weighted average embeddings of multiple named entity chunk annotations

- ---- - - - - - - - - - - -

Input Annotation types

Output Annotation type

DEPENDENCY, CHUNK

SENTENCE_EMBEDDINGS

-
-
-
Parameters
-
-
targetEntities
-

Target entities and their related entities

-
-
-
entityWeights

Relative weights of entities.

-
-
maxSyntacticDistance

Maximal syntactic distance between related entities. Default value is 2.

-
-
-
-
-
-
- ---- - - - - - - - - -

result

drug_embedding

metformin 125 mg

[-0.267413, 0.07614058, -0.5620966, 0.83838946, 0.8911504]

250 mg coumadin

[0.22319649, -0.07094894, -0.6885556, 0.79176235, 0.82672405]

one pill paracetamol

[-0.10939768, -0.29242, -0.3574444, 0.3981813, 0.79609615]

-
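A condensed configuration sketch (the "ner_chunk" and "dependencies" input columns are assumed to be produced by an upstream NER converter and dependency parser; the "DRUG" label is illustrative):

>>> drug_embeddings = EntityChunkEmbeddings.pretrained("sbiobert_base_cased_mli", "en", "clinical/models") \
...     .setInputCols(["ner_chunk", "dependencies"]) \
...     .setOutputCol("drug_embedding") \
...     .setTargetEntities({"DRUG": []}) \
...     .setMaxSyntacticDistance(2)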

Methods

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

__init__([classname, java_model])

Initialize this instance with a Java model object.

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

getBatchSize()

Gets current batch size.

getCaseSensitive()

Gets whether to ignore case in tokens for embeddings matching.

getDimension()

Gets embeddings dimension.

getInputCols()

Gets current column names of input annotations.

getLazyAnnotator()

Gets whether Annotator should be evaluated lazily in a RecursivePipeline.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getOutputCol()

Gets output column name of annotations.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

getStorageRef()

Gets unique reference name for identification.

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

loadSavedModel(folder, spark_session)

Loads a locally saved model.

pretrained([name, lang, remote_loc])

Downloads and loads a pretrained model.

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

set(param, value)

Sets a parameter in the embedded param map.

setBatchSize(v)

Sets batch size.

setCaseSensitive(value)

Sets whether to ignore case in tokens for embeddings matching.

setConfigProtoBytes(b)

Sets configProto from tensorflow, serialized into byte array.

setDimension(value)

Sets embeddings dimension.

setEntityWeights([weights])

Sets the relative weights of the embeddings of specific entities. By default the dictionary is empty and all entities have equal weights.

setInputCols(*value)

Sets column names of input annotations.

setIsLong(value)

Sets whether to use Long type instead of Int type for inputs buffer.

setLazyAnnotator(value)

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

setMaxSentenceLength(value)

Sets max sentence length to process.

setMaxSyntacticDistance(distance)

Sets the maximal syntactic distance between related entities. Default value is 2.

setOutputCol(value)

Sets output column name of annotations.

setParamValue(paramName)

Sets the value of a parameter.

setParams()

setStorageRef(value)

Sets unique reference name for identification.

setTargetEntities([entities])

Sets the target entities and maps them to their related entities.

transform(dataset[, params])

Transforms the input dataset with optional parameters.

write()

Returns an MLWriter instance for this ML instance.

-

Attributes

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

batchSize

caseSensitive

configProtoBytes

dimension

entityWeights

getter_attrs

inputCols

isLong

lazyAnnotator

maxSentenceLength

maxSyntacticDistance

name

outputCol

params

Returns all params ordered by name.

storageRef

targetEntities

-
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some -extra params. This implementation first calls Params.copy and -then make a copy of the companion Java pipeline component with -extra params. So both the Python wrapper and the Java pipeline -component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
-
-getBatchSize()
-

Gets current batch size.

-
-
Returns
-
-
int

Current batch size

-
-
-
-
-
- -
-
-getCaseSensitive()
-

Gets whether to ignore case in tokens for embeddings matching.

-
-
Returns
-
-
bool

Whether to ignore case in tokens for embeddings matching

-
-
-
-
-
- -
-
-getDimension()
-

Gets embeddings dimension.

-
- -
-
-getInputCols()
-

Gets current column names of input annotations.

-
- -
-
-getLazyAnnotator()
-

Gets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getOutputCol()
-

Gets output column name of annotations.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-getStorageRef()
-

Gets unique reference name for identification.

-
-
Returns
-
-
str

Unique reference name for identification

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-static loadSavedModel(folder, spark_session)
-

Loads a locally saved model.

-
-
Parameters
-
-
folderstr

Folder of the saved model

-
-
spark_sessionpyspark.sql.SparkSession

The current SparkSession

-
-
-
-
Returns
-
-
BertSentenceEmbeddings

The restored model

-
-
-
-
-
- -
-
-property params
-

Returns all params ordered by name. The default implementation -uses dir() to get all attributes of type -Param.

-
- -
-
-static pretrained(name='sbiobert_base_cased_mli', lang='en', remote_loc='clinical/models')[source]
-

Downloads and loads a pretrained model.

-
-
Parameters
-
-
namestr, optional

Name of the pretrained model, by default "sbiobert_base_cased_mli"

-
-
langstr, optional

Language of the pretrained model, by default “en”

-
-
remote_locstr, optional

Optional remote address of the resource, by default None. Will use -Spark NLPs repositories otherwise.

-
-
-
-
Returns
-
-
BertSentenceEmbeddings

The restored model

-
-
-
-
-
- -
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setBatchSize(v)
-

Sets batch size.

-
-
Parameters
-
-
vint

Batch size

-
-
-
-
-
- -
-
-setCaseSensitive(value)
-

Sets whether to ignore case in tokens for embeddings matching.

-
-
Parameters
-
-
valuebool

Whether to ignore case in tokens for embeddings matching

-
-
-
-
-
- -
-
-setConfigProtoBytes(b)
-

Sets configProto from tensorflow, serialized into byte array.

-
-
Parameters
-
-
bList[str]

ConfigProto from tensorflow, serialized into byte array

-
-
-
-
-
- -
-
-setDimension(value)
-

Sets embeddings dimension.

-
-
Parameters
-
-
valueint

Embeddings dimension

-
-
-
-
-
- -
-
-setEntityWeights(weights={})[source]
-
-
Sets the relative weights of the embeddings of specific entities. By default the dictionary is empty and

all entities have equal weights. If non-empty and some entity is not in it, then its weight is set to 0.

-
-
-
-
Parameters
-
-
weights:dict[str, float]

Dictionary with the relative weights of entities. The notation TARGET_ENTITY:RELATED_ENTITY can be used to -specify the weight of an entity which is related to a specific target entity (e.g. "DRUG:SYMPTOM": 0.3). -Entity names are case insensitive.

-
-
-
-
-
- -
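For example, continuing the configuration sketch above (entity labels are illustrative), weights can be given per entity or per TARGET_ENTITY:RELATED_ENTITY pair:

>>> drug_embeddings.setEntityWeights({"DRUG": 0.8, "DRUG:SYMPTOM": 0.2})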
-
-setInputCols(*value)
-

Sets column names of input annotations.

-
-
Parameters
-
-
*valuestr

Input columns for the annotator

-
-
-
-
-
- -
-
-setIsLong(value)
-

Sets whether to use Long type instead of Int type for inputs buffer.

-

Some Bert models require Long instead of Int.

-
-
Parameters
-
-
valuebool

Whether to use Long type instead of Int type for inputs buffer

-
-
-
-
-
- -
-
-setLazyAnnotator(value)
-

Sets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
-
Parameters
-
-
valuebool

Whether Annotator should be evaluated lazily in a -RecursivePipeline

-
-
-
-
-
- -
-
-setMaxSentenceLength(value)
-

Sets max sentence length to process.

-
-
Parameters
-
-
valueint

Max sentence length to process

-
-
-
-
-
- -
-
-setMaxSyntacticDistance(distance)[source]
-

Sets the maximal syntactic distance between related entities. Default value is 2.

-
-
Parameters
-
-
distanceint

Maximal syntactic distance

-
-
- -
-
-setOutputCol(value)
-

Sets output column name of annotations.

-
-
Parameters
-
-
valuestr

Name of output column

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-setStorageRef(value)
-

Sets unique reference name for identification.

-
-
Parameters
-
-
valuestr

Unique reference name for identification

-
-
-
-
-
- -
-
-setTargetEntities(entities={})[source]
-

Sets the target entities and maps them to their related entities. A target entity with an empty list of -related entities means all other entities are assumed to be related to it.

-
-
Parameters
-
-
entities: dict[str, list[str]]

Dictionary with target and related entities (TARGET: [RELATED1, RELATED2,…]). If the list of related -entities is empty, then all non-target entities are considered. -Entity names are case insensitive.

-
-
-
-
-
- -
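For example (illustrative labels; an empty list of related entities means all non-target entities are considered related):

>>> drug_embeddings.setTargetEntities({"DRUG": ["STRENGTH", "ROUTE"], "SYMPTOM": []})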
-
-transform(dataset, params=None)
-

Transforms the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset

-
-
paramsdict, optional

an optional param map that overrides embedded params.

-
-
-
-
Returns
-
-
pyspark.sql.DataFrame

transformed dataset

-
-
-
-
-
- -
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

-
- -
- -
- - -
- - - - -
- - -
-
- - - -
-
- - - - - -
-
- - \ No newline at end of file diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.GenericClassifierApproach.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.GenericClassifierApproach.html deleted file mode 100644 index 088f52f995..0000000000 --- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.GenericClassifierApproach.html +++ /dev/null @@ -1,1286 +0,0 @@ - - - - - - - - sparknlp_jsl.annotator.GenericClassifierApproach — Spark NLP 3.3.0 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl.annotator.GenericClassifierApproach

-
-
-class sparknlp_jsl.annotator.GenericClassifierApproach(classname='com.johnsnowlabs.nlp.annotators.generic_classifier.GenericClassifierApproach')[source]
-

Bases: sparknlp.common.AnnotatorApproach

-

Trains a TensorFlow model for generic classification of feature vectors. It takes FEATURE_VECTOR annotations from -FeaturesAssembler as input, classifies them and outputs CATEGORY annotations.

- ---- - - - - - - - - - - -

Input Annotation types

Output Annotation type

FEATURE_VECTOR

CATEGORY

-
-
Parameters
-
-
labelColumn

Column with one label per document

-
-
batchSize

Size for each batch in the optimization process

-
-
epochsN

Number of epochs for the optimization process

-
-
learningRate

Learning rate for the optimization process

-
-
dropout

Dropout at the output of each layer

-
-
validationSplit

Validation split - how much data to use for validation

-
-
modelFile

File name to load the model from

-
-
fixImbalance

A flag indicating whether to balance the training set

-
-
featureScaling

Feature scaling method. Possible values are ‘zscore’, ‘minmax’ or empty (no scaling)

-
-
outputLogsPath

Path to folder where logs will be saved. If no path is specified, no logs are generated

-
-
-
-
-

Examples

-
>>> import sparknlp
->>> from sparknlp.base import *
->>> from sparknlp.common import *
->>> from sparknlp.annotator import *
->>> from sparknlp.training import *
->>> import sparknlp_jsl
->>> from sparknlp_jsl.base import *
->>> from sparknlp_jsl.annotator import *
->>> from pyspark.ml import Pipeline
>>> features_asm = FeaturesAssembler() \
-...    .setInputCols(["feature_1", "feature_2", "...", "feature_n"]) \
-...    .setOutputCol("features")
-...
->>> gen_clf = GenericClassifierApproach() \
-...    .setLabelColumn("target") \
-...    .setInputCols(["features"]) \
-...    .setOutputCol("prediction") \
-...    .setModelFile("/path/to/graph_file.pb") \
-...    .setEpochsNumber(50) \
-...    .setBatchSize(100) \
-...    .setFeatureScaling("zscore") \
-...    .setLearningRate(0.001) \
-...    .setFixImbalance(True) \
-...    .setOutputLogsPath("logs") \
-...    .setValidationSplit(0.2) # keep 20% of the data for validation purposes
-...
->>> pipeline = Pipeline().setStages([
-...    features_asm,
-...    gen_clf
-...])
-...
->>> clf_model = pipeline.fit(data)
-
-
-

Methods

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

__init__([classname])

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

fit(dataset[, params])

Fits a model to the input dataset with optional parameters.

fitMultiple(dataset, paramMaps)

Fits a model to the input dataset for each param map in paramMaps.

getInputCols()

Gets current column names of input annotations.

getLazyAnnotator()

Gets whether Annotator should be evaluated lazily in a RecursivePipeline.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getOutputCol()

Gets output column name of annotations.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

set(param, value)

Sets a parameter in the embedded param map.

setBatchSize(size)

Size for each batch in the optimization process

setDropout(dropout)

Sets dropout

setEpochsNumber(epochs)

Sets number of epochs for the optimization process

setFeatureScaling(feature_scaling)

Sets Feature scaling method.

setFixImbalance(fix_imbalance)

Sets a flag indicating whether to balance the training set.

setInputCols(*value)

Sets column names of input annotations.

setLabelCol(label_column)

Sets the column with one label per document (the value we are trying to predict)

setLazyAnnotator(value)

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

setLearningRate(lamda)

Sets learning rate for the optimization process

setModelFile(mode_file)

Sets file name to load the model from.

setOutputCol(value)

Sets output column name of annotations.

setOutputLogsPath(output_logs_path)

Sets path to folder where logs will be saved.

setParamValue(paramName)

Sets the value of a parameter.

setValidationSplit(validation_split)

Sets validation split - how much data to use for validation

write()

Returns an MLWriter instance for this ML instance.

-

Attributes

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

batchSize

dropout

epochsN

featureScaling

fixImbalance

getter_attrs

inputCols

labelColumn

lazyAnnotator

learningRate

modelFile

outputCol

outputLogsPath

params

Returns all params ordered by name.

validationSplit

-
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some -extra params. This implementation first calls Params.copy and -then make a copy of the companion Java pipeline component with -extra params. So both the Python wrapper and the Java pipeline -component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
-
-fit(dataset, params=None)
-

Fits a model to the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset.

-
-
paramsdict or list or tuple, optional

an optional param map that overrides embedded params. If a list/tuple of -param maps is given, this calls fit on each param map and returns a list of -models.

-
-
-
-
Returns
-
-
Transformer or a list of Transformer

fitted model(s)

-
-
-
-
-
- -
-
-fitMultiple(dataset, paramMaps)
-

Fits a model to the input dataset for each param map in paramMaps.

-
-

New in version 2.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset.

-
-
paramMapscollections.abc.Sequence

A Sequence of param maps.

-
-
-
-
Returns
-
-
_FitMultipleIterator

A thread safe iterable which contains one model for each param map. Each -call to next(modelIterator) will return (index, model) where model was fit -using paramMaps[index]. index values may not be sequential.

-
-
-
-
-
- -
-
-getInputCols()
-

Gets current column names of input annotations.

-
- -
-
-getLazyAnnotator()
-

Gets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getOutputCol()
-

Gets output column name of annotations.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-property params
-

Returns all params ordered by name. The default implementation -uses dir() to get all attributes of type -Param.

-
- -
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setBatchSize(size)[source]
-

Size for each batch in the optimization process

-
-
Parameters
-
-
sizeint

Size for each batch in the optimization process

-
-
-
-
-
- -
-
-setDropout(dropout)[source]
-

Sets dropout

-
-
Parameters
-
-
dropoutfloat

Dropout at the output of each layer

-
-
-
-
-
- -
-
-setEpochsNumber(epochs)[source]
-

Sets number of epochs for the optimization process

-
-
Parameters
-
-
epochsint

Number of epochs for the optimization process

-
-
-
-
-
- -
-
-setFeatureScaling(feature_scaling)[source]
-

Sets Feature scaling method. Possible values are ‘zscore’, ‘minmax’ or empty (no scaling).

-
-
Parameters
-
-
feature_scalingstr

Feature scaling method. Possible values are ‘zscore’, ‘minmax’ or empty (no scaling).

-
-
-
-
-
- -
-
-setFixImbalance(fix_imbalance)[source]
-

Sets a flag indicating whether to balance the training set.

-
-
Parameters
-
-
fix_imbalancebool

A flag indicating whether to balance the training set.

-
-
-
-
-
- -
-
-setInputCols(*value)
-

Sets column names of input annotations.

-
-
Parameters
-
-
*valuestr

Input columns for the annotator

-
-
-
-
-
- -
-
-setLabelCol(label_column)[source]
-

Sets the column with one label per document (the value we are trying to predict).

-
-
Parameters
-
-
labelstr

Column with the value result we are trying to predict.

-
-
-
-
-
- -
-
-setLazyAnnotator(value)
-

Sets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
-
Parameters
-
-
valuebool

Whether Annotator should be evaluated lazily in a -RecursivePipeline

-
-
-
-
-
- -
-
-setLearningRate(lamda)[source]
-

Sets learning rate for the optimization process

-
-
Parameters
-
-
lamdafloat

Learning rate for the optimization process

-
-
-
-
-
- -
-
-setModelFile(mode_file)[source]
-

Sets file name to load the model from.

-
-
Parameters
-
-
mode_filestr

File name to load the model from.

-
-
-
-
-
- -
-
-setOutputCol(value)
-

Sets output column name of annotations.

-
-
Parameters
-
-
valuestr

Name of output column

-
-
-
-
-
- -
-
-setOutputLogsPath(output_logs_path)[source]
-

Sets path to folder where logs will be saved. If no path is specified, no logs are generated

-
-
Parameters
-
-
labelstr

Path to folder where logs will be saved. If no path is specified, no logs are generated

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-setValidationSplit(validation_split)[source]
-

Sets validation split - how much data to use for validation

-
-
Parameters
-
-
validation_splitfloat

Validation split - how much data to use for validation

-
-
-
-
-
- -
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

-
- -
- -
- - -
- - - - -
- - -
-
- - - -
-
- - - - - -
-
- - \ No newline at end of file diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.GenericClassifierModel.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.GenericClassifierModel.html deleted file mode 100644 index df8e8916d1..0000000000 --- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.GenericClassifierModel.html +++ /dev/null @@ -1,1033 +0,0 @@ - - - - - - - - sparknlp_jsl.annotator.GenericClassifierModel — Spark NLP 3.3.0 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl.annotator.GenericClassifierModel

-
-
-class sparknlp_jsl.annotator.GenericClassifierModel(classname='com.johnsnowlabs.nlp.annotators.generic_classifier.GenericClassifierModel', java_model=None)[source]
-

Bases: sparknlp.common.AnnotatorModel

-

Generic classifier of feature vectors. It takes FEATURE_VECTOR annotations from -FeaturesAssembler as input, classifies them and outputs CATEGORY annotations.

- ---- - - - - - - - - - - -

Input Annotation types

Output Annotation type

FEATURE_VECTOR

CATEGORY

-

Examples

-
>>> import sparknlp
->>> from sparknlp.base import *
->>> from sparknlp.common import *
->>> from sparknlp.annotator import *
->>> from sparknlp.training import *
->>> import sparknlp_jsl
->>> from sparknlp_jsl.base import *
->>> from sparknlp_jsl.annotator import *
->>> from pyspark.ml import Pipeline
->>> features_asm = FeaturesAssembler() \
-...    .setInputCols(["feature_1", "feature_2", "...", "feature_n"]) \
-...    .setOutputCol("features")
-...
->>> gen_clf = GenericClassifierModel.pretrained() \
-...    .setInputCols(["features"]) \
-...    .setOutputCol("prediction") \
-...
->>> pipeline = Pipeline().setStages([
-...    features_asm,
-...    gen_clf
-...])
-...
->>> clf_model = pipeline.fit(data)
-
-
-

Methods

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

__init__([classname, java_model])

Initialize this instance with a Java model object.

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

getInputCols()

Gets current column names of input annotations.

getLazyAnnotator()

Gets whether Annotator should be evaluated lazily in a RecursivePipeline.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getOutputCol()

Gets output column name of annotations.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

pretrained(name[, lang, remote_loc])

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

set(param, value)

Sets a parameter in the embedded param map.

setInputCols(*value)

Sets column names of input annotations.

setLazyAnnotator(value)

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

setOutputCol(value)

Sets output column name of annotations.

setParamValue(paramName)

Sets the value of a parameter.

setParams()

transform(dataset[, params])

Transforms the input dataset with optional parameters.

write()

Returns an MLWriter instance for this ML instance.

-

Attributes

- ---- - - - - - - - - - - - - - - - - - - - - - - - -

classes

getter_attrs

inputCols

lazyAnnotator

name

outputCol

params

Returns all params ordered by name.

-
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some -extra params. This implementation first calls Params.copy and -then make a copy of the companion Java pipeline component with -extra params. So both the Python wrapper and the Java pipeline -component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
-
-getInputCols()
-

Gets current column names of input annotations.

-
- -
-
-getLazyAnnotator()
-

Gets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getOutputCol()
-

Gets output column name of annotations.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-property params
-

Returns all params ordered by name. The default implementation -uses dir() to get all attributes of type -Param.

-
- -
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setInputCols(*value)
-

Sets column names of input annotations.

-
-
Parameters
-
-
*valuestr

Input columns for the annotator

-
-
-
-
-
- -
-
-setLazyAnnotator(value)
-

Sets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
-
Parameters
-
-
valuebool

Whether Annotator should be evaluated lazily in a -RecursivePipeline

-
-
-
-
-
- -
-
-setOutputCol(value)
-

Sets output column name of annotations.

-
-
Parameters
-
-
valuestr

Name of output column

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-transform(dataset, params=None)
-

Transforms the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset

-
-
paramsdict, optional

an optional param map that overrides embedded params.

-
-
-
-
Returns
-
-
pyspark.sql.DataFrame

transformed dataset

-
-
-
-
-
- -
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

-
- -
- -
- - -
- - - - -
- - -
-
- - - -
-
- - - - - -
-
- - \ No newline at end of file diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.IOBTagger.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.IOBTagger.html deleted file mode 100644 index 1478af6feb..0000000000 --- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.IOBTagger.html +++ /dev/null @@ -1,1063 +0,0 @@ - - - - - - - - sparknlp_jsl.annotator.IOBTagger — Spark NLP 3.3.0 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

sparknlp_jsl.annotator.IOBTagger

-
-
-class sparknlp_jsl.annotator.IOBTagger(classname='com.johnsnowlabs.nlp.annotators.ner.IOBTagger', java_model=None)[source]
-

Bases: sparknlp.common.AnnotatorModel

-

Merges token tags and NER labels from chunks in the specified format. For example, the token output of a Tokenizer and the chunk output of a NerConverter can be used as inputs.

- ---- - - - - - - - - - - -

Input Annotation types

Output Annotation type

TOKEN, CHUNK

NAMED_ENTITY

-
-
Parameters
-
-
Scheme

Format of tags, either IOB or BIOES

-
-
-
-
-

Examples

-
>>> import sparknlp
->>> from sparknlp.base import *
->>> from sparknlp.common import *
->>> from sparknlp.annotator import *
->>> from sparknlp.training import *
->>> import sparknlp_jsl
->>> from sparknlp_jsl.base import *
->>> from sparknlp_jsl.annotator import *
->>> from pyspark.ml import Pipeline
->>> data = spark.createDataFrame([["A 63-year-old man presents to the hospital ..."]]).toDF("text")
->>> documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
->>> sentenceDetector = SentenceDetector().setInputCols(["document"]).setOutputCol("sentence")
->>> tokenizer = Tokenizer().setInputCols(["sentence"]).setOutputCol("token")
->>> embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models").setInputCols(["sentence", "token"]).setOutputCol("embs")
->>> nerModel = MedicalNerModel.pretrained("ner_jsl", "en", "clinical/models").setInputCols(["sentence", "token", "embs"]).setOutputCol("ner")
->>> nerConverter = NerConverter().setInputCols(["sentence", "token", "ner"]).setOutputCol("ner_chunk")
-...
->>> iobTagger = IOBTagger().setInputCols(["token", "ner_chunk"]).setOutputCol("ner_label")
->>> pipeline = Pipeline(stages=[documentAssembler, sentenceDetector, tokenizer, embeddings, nerModel, nerConverter, iobTagger])
-...
->>> result.selectExpr("explode(ner_label) as a") \
-...     .selectExpr("a.begin", "a.end", "a.result as chunk", "a.metadata.word as word") \
-...     .where("chunk != 'O'").show(5, False)
-+-----+---+-----------+-----------+
-|begin|end|chunk      |word       |
-+-----+---+-----------+-----------+
-|5    |15 |B-Age      |63-year-old|
-|17   |19 |B-Gender   |man        |
-|64   |72 |B-Modifier |recurrent  |
-|98   |107|B-Diagnosis|cellulitis |
-|110  |119|B-Diagnosis|pneumonias |
-+-----+---+-----------+-----------+
-
-
-

Methods

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

__init__([classname, java_model])

Initialize this instance with a Java model object.

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

getInputCols()

Gets current column names of input annotations.

getLazyAnnotator()

Gets whether Annotator should be evaluated lazily in a RecursivePipeline.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getOutputCol()

Gets output column name of annotations.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

set(param, value)

Sets a parameter in the embedded param map.

setInputCols(*value)

Sets column names of input annotations.

setLazyAnnotator(value)

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

setOutputCol(value)

Sets output column name of annotations.

setParamValue(paramName)

Sets the value of a parameter.

setParams()

setScheme(f)

Sets format of tags, either IOB or BIOES

transform(dataset[, params])

Transforms the input dataset with optional parameters.

write()

Returns an MLWriter instance for this ML instance.

-

Attributes

- ---- - - - - - - - - - - - - - - - - - - - - - - - -

getter_attrs

inputCols

lazyAnnotator

name

outputCol

params

Returns all params ordered by name.

scheme

-
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some -extra params. This implementation first calls Params.copy and -then make a copy of the companion Java pipeline component with -extra params. So both the Python wrapper and the Java pipeline -component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
-
-getInputCols()
-

Gets current column names of input annotations.

-
- -
-
-getLazyAnnotator()
-

Gets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getOutputCol()
-

Gets output column name of annotations.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-property params
-

Returns all params ordered by name. The default implementation -uses dir() to get all attributes of type -Param.

-
- -
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setInputCols(*value)
-

Sets column names of input annotations.

-
-
Parameters
-
-
*valuestr

Input columns for the annotator

-
-
-
-
-
- -
-
-setLazyAnnotator(value)
-

Sets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
-
Parameters
-
-
valuebool

Whether Annotator should be evaluated lazily in a -RecursivePipeline

-
-
-
-
-
- -
-
-setOutputCol(value)
-

Sets output column name of annotations.

-
-
Parameters
-
-
valuestr

Name of output column

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-setScheme(f)[source]
-

Sets format of tags, either IOB or BIOES

-
-
Parameters
-
-
fstr

Format of tags, either IOB or BIOES

-
-
-
-
-
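For example, a minimal sketch switching the tagger shown above to BIOES tags (column names follow the example at the top of this page):

>>> iobTagger = IOBTagger() \
...     .setInputCols(["token", "ner_chunk"]) \
...     .setOutputCol("ner_label") \
...     .setScheme("BIOES")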
- -
-
-transform(dataset, params=None)
-

Transforms the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset

-
-
paramsdict, optional

an optional param map that overrides embedded params.

-
-
-
-
Returns
-
-
pyspark.sql.DataFrame

transformed dataset

-
-
-
-
-
- -
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

-
- -
- -
- - -
- - - - -
- - -
-
- - - -
-
- - - - - -
-
\ No newline at end of file
diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.MedicalBertForSequenceClassification.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.MedicalBertForSequenceClassification.html
deleted file mode 100644
index a5364a3b21..0000000000
--- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.MedicalBertForSequenceClassification.html
+++ /dev/null
@@ -1,1264 +0,0 @@
-sparknlp_jsl.annotator.MedicalBertForSequenceClassification — Spark NLP 3.3.0 documentation

sparknlp_jsl.annotator.MedicalBertForSequenceClassification

-
-
-class sparknlp_jsl.annotator.MedicalBertForSequenceClassification(classname='com.johnsnowlabs.nlp.annotators.classification.MedicalBertForSequenceClassification', java_model=None)[source]
-

Bases: sparknlp.common.AnnotatorModel, sparknlp.common.HasCaseSensitiveProperties, sparknlp.common.HasBatchedAnnotate

-

MedicalBertForSequenceClassification can load BERT models with a sequence classification/regression head on top (a linear layer on top of the pooled output), e.g. for multi-class document classification tasks.

-

Pretrained models can be loaded with pretrained() of the companion -object:

-

For available pretrained models please see the Models Hub.

-

Models from the HuggingFace 🤗 Transformers library are also compatible with -Spark NLP 🚀. To see which models are compatible and how to import them see -Import Transformers into Spark NLP 🚀.

- ---- - - - - - - - - - - -

Input Annotation types

Output Annotation type

DOCUMENT, TOKEN

CATEGORY

-
-
Parameters
-
-
batchSize

Batch size. Large values allow faster processing but require more memory, by default 8

-
-
caseSensitive

Whether to ignore case in tokens for embeddings matching, by default -True

-
-
configProtoBytes

ConfigProto from tensorflow, serialized into byte array.

-
-
maxSentenceLength

Max sentence length to process, by default 128

-
-
-
-
-

Examples

-
>>> import sparknlp
->>> from sparknlp.base import *
->>> from sparknlp.annotator import *
->>> from pyspark.ml import Pipeline
->>> documentAssembler = DocumentAssembler() \
-...     .setInputCol("text") \
-...     .setOutputCol("document")
->>> tokenizer = Tokenizer() \
-...     .setInputCols(["document"]) \
-...     .setOutputCol("token")
->>> tokenClassifier = MedicalBertForSequenceClassification.pretrained() \
-...     .setInputCols(["token", "document"]) \
-...     .setOutputCol("label") \
-...     .setCaseSensitive(True)
->>> pipeline = Pipeline().setStages([
-...     documentAssembler,
-...     tokenizer,
-...     tokenClassifier
-... ])
->>> data = spark.createDataFrame([["I felt a bit drowsy and had blurred vision after taking Aspirin."]]).toDF("text")
->>> result = pipeline.fit(data).transform(data)
->>> result.select("label.result").show(truncate=False)
-
-
-

Methods

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

__init__([classname, java_model])

Initialize this instance with a Java model object.

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

getBatchSize()

Gets current batch size.

getCaseSensitive()

Gets whether to ignore case in tokens for embeddings matching.

getInputCols()

Gets current column names of input annotations.

getLazyAnnotator()

Gets whether Annotator should be evaluated lazily in a RecursivePipeline.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getOutputCol()

Gets output column name of annotations.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

loadSavedModel(folder, spark_session)

Loads a locally saved model Parameters ---------- folder : str Folder of the saved model spark_session : pyspark.sql.SparkSession The current SparkSession

loadSavedModelOpenSource(...)

Loads a locally saved model.

pretrained([name, lang, remote_loc])

Downloads and loads a pretrained model.

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

set(param, value)

Sets a parameter in the embedded param map.

setBatchSize(v)

Sets batch size.

setCaseSensitive(value)

Sets whether to ignore case in tokens for embeddings matching.

setCoalesceSentences(value)

Instead of 1 class per sentence (if inputCols is '''sentence''') output 1 class per document by averaging probabilities in all sentences.

setConfigProtoBytes(b)

Sets configProto from tensorflow, serialized into byte array.

setInputCols(*value)

Sets column names of input annotations.

setLazyAnnotator(value)

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

setMaxSentenceLength(value)

Sets max sentence length to process, by default 128.

setOutputCol(value)

Sets output column name of annotations.

setParamValue(paramName)

Sets the value of a parameter.

setParams()

transform(dataset[, params])

Transforms the input dataset with optional parameters.

write()

Returns an MLWriter instance for this ML instance.

-

Attributes

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

batchSize

caseSensitive

coalesceSentences

configProtoBytes

getter_attrs

inputCols

lazyAnnotator

maxSentenceLength

name

outputCol

params

Returns all params ordered by name.

-
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some -extra params. This implementation first calls Params.copy and -then make a copy of the companion Java pipeline component with -extra params. So both the Python wrapper and the Java pipeline -component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
-
-getBatchSize()
-

Gets current batch size.

-
-
Returns
-
-
int

Current batch size

-
-
-
-
-
- -
-
-getCaseSensitive()
-

Gets whether to ignore case in tokens for embeddings matching.

-
-
Returns
-
-
bool

Whether to ignore case in tokens for embeddings matching

-
-
-
-
-
- -
-
-getInputCols()
-

Gets current column names of input annotations.

-
- -
-
-getLazyAnnotator()
-

Gets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getOutputCol()
-

Gets output column name of annotations.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-static loadSavedModel(folder, spark_session)[source]
-

Loads a locally saved model.

-
-
Parameters
-
-
folderstr

Folder of the saved model

-
-
-
spark_sessionpyspark.sql.SparkSession

The current SparkSession

-
-
-
-
Returns
-
-
MedicalBertForSequenceClassification

The restored model

-
-
-
-
-
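A hedged usage sketch; the export folder name is hypothetical and spark is an active SparkSession:

>>> classifier = MedicalBertForSequenceClassification.loadSavedModel("exported_bert_seq_model", spark)
>>> classifier.setInputCols(["token", "document"]).setOutputCol("label")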
- -
-
-static loadSavedModelOpenSource(bertForTokenClassifierPath, tfModelPath, spark_session)[source]
-

Loads a locally saved model.

-
-
Parameters
-
-
bertForTokenClassifierPathstr

Folder of the bertForTokenClassifier

-
-
tfModelPathstr

Folder that contains the TF model

-
-
spark_sessionpyspark.sql.SparkSession

The current SparkSession

-
-
Returns
-
-
-
MedicalBertForSequenceClassification

The restored model

-
-
-
-
-
- -
-
-property params
-

Returns all params ordered by name. The default implementation -uses dir() to get all attributes of type -Param.

-
- -
-
-static pretrained(name='bert_sequence_classifier_ade', lang='en', remote_loc='clinical/models')[source]
-

Downloads and loads a pretrained model.

-
-
Parameters
-
-
namestr, optional

Name of the pretrained model.

-
-
langstr, optional

Language of the pretrained model, by default “en”

-
-
remote_locstr, optional

Optional remote address of the resource, by default None. Will use Spark NLP's repositories otherwise.

-
-
-
-
Returns
-
-
MedicalBertForSequenceClassification

The restored model

-
-
-
-
-
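For example, loading the default model named in the signature above (assumes the model is downloadable with a valid license or already cached locally):

>>> classifier = MedicalBertForSequenceClassification.pretrained("bert_sequence_classifier_ade", "en", "clinical/models")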
- -
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setBatchSize(v)
-

Sets batch size.

-
-
Parameters
-
-
vint

Batch size

-
-
-
-
-
- -
-
-setCaseSensitive(value)
-

Sets whether to ignore case in tokens for embeddings matching.

-
-
Parameters
-
-
valuebool

Whether to ignore case in tokens for embeddings matching

-
-
-
-
-
- -
-
-setCoalesceSentences(value)[source]
-

Instead of one class per sentence (if inputCols is "sentence"), output one class per document by averaging the probabilities of all sentences. Due to the max sequence length limit in almost all transformer models such as BERT (512 tokens), this parameter helps feed all the sentences into the model and average all the probabilities for the entire document instead of per sentence. (Default: true)

-
-
Parameters
-
-
valuebool

If the output of all sentences will be averaged to one output

-
-
-
-
-
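A short sketch of both settings (assuming classifier is the annotator documented on this page):

>>> classifier.setCoalesceSentences(True)   # one averaged prediction per document
>>> classifier.setCoalesceSentences(False)  # one prediction per sentence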
- -
-
-setConfigProtoBytes(b)[source]
-

Sets configProto from tensorflow, serialized into byte array.

-
-
Parameters
-
-
bList[str]

ConfigProto from tensorflow, serialized into byte array

-
-
-
-
-
- -
-
-setInputCols(*value)
-

Sets column names of input annotations.

-
-
Parameters
-
-
*valuestr

Input columns for the annotator

-
-
-
-
-
- -
-
-setLazyAnnotator(value)
-

Sets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
-
Parameters
-
-
valuebool

Whether Annotator should be evaluated lazily in a -RecursivePipeline

-
-
-
-
-
- -
-
-setMaxSentenceLength(value)[source]
-

Sets max sentence length to process, by default 128.

-
-
Parameters
-
-
valueint

Max sentence length to process

-
-
-
-
-
- -
-
-setOutputCol(value)
-

Sets output column name of annotations.

-
-
Parameters
-
-
valuestr

Name of output column

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-transform(dataset, params=None)
-

Transforms the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset

-
-
paramsdict, optional

an optional param map that overrides embedded params.

-
-
-
-
Returns
-
-
pyspark.sql.DataFrame

transformed dataset

-
-
-
-
-
- -
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

-
- -
- -
- - -
- - - - -
- - -
-
- - - -
-
- - - - - -
-
\ No newline at end of file
diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.MedicalBertForTokenClassifier.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.MedicalBertForTokenClassifier.html
deleted file mode 100644
index 19c0723a8a..0000000000
--- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.MedicalBertForTokenClassifier.html
+++ /dev/null
@@ -1,1256 +0,0 @@
-sparknlp_jsl.annotator.MedicalBertForTokenClassifier — Spark NLP 3.3.0 documentation

sparknlp_jsl.annotator.MedicalBertForTokenClassifier

-
-
-class sparknlp_jsl.annotator.MedicalBertForTokenClassifier(classname='com.johnsnowlabs.nlp.annotators.classification.MedicalBertForTokenClassifier', java_model=None)[source]
-

Bases: sparknlp.common.AnnotatorModel, sparknlp.common.HasCaseSensitiveProperties, sparknlp.common.HasBatchedAnnotate

-

MedicalBertForTokenClassifier can load Bert Models with a token -classification head on top (a linear layer on top of the hidden-states -output) e.g. for Named-Entity-Recognition (NER) tasks.

-

Pretrained models can be loaded with pretrained() of the companion -object:

-
>>> embeddings = MedicalBertForTokenClassifier.pretrained() \
-...     .setInputCols(["token", "document"]) \
-...     .setOutputCol("label")
-
-
-

The default model is "bert_token_classifier_ner_bionlp", if no name is -provided.

-

For available pretrained models please see the Models Hub.

-

Models from the HuggingFace 🤗 Transformers library are also compatible with -Spark NLP 🚀. To see which models are compatible and how to import them see -Import Transformers into Spark NLP 🚀.

- ---- - - - - - - - - - - -

Input Annotation types

Output Annotation type

DOCUMENT, TOKEN

NAMED_ENTITY

-
-
Parameters
-
-
batchSize

Batch size. Large values allow faster processing but require more memory, by default 8

-
-
caseSensitive

Whether to ignore case in tokens for embeddings matching, by default -True

-
-
configProtoBytes

ConfigProto from tensorflow, serialized into byte array.

-
-
maxSentenceLength

Max sentence length to process, by default 128

-
-
-
-
-

Examples

-
>>> import sparknlp
->>> from sparknlp.base import *
->>> from sparknlp.annotator import *
->>> from pyspark.ml import Pipeline
->>> documentAssembler = DocumentAssembler() \
-...     .setInputCol("text") \
-...     .setOutputCol("document")
->>> tokenizer = Tokenizer() \
-...     .setInputCols(["document"]) \
-...     .setOutputCol("token")
->>> tokenClassifier = MedicalBertForTokenClassifier.pretrained() \
-...     .setInputCols(["token", "document"]) \
-...     .setOutputCol("label") \
-...     .setCaseSensitive(True)
->>> pipeline = Pipeline().setStages([
-...     documentAssembler,
-...     tokenizer,
-...     tokenClassifier
-... ])
->>> data = spark.createDataFrame([["Both the erbA IRES and the erbA/myb virus constructs transformed erythroid cells after infection of bone marrow or blastoderm cultures."]]).toDF("text")
->>> result = pipeline.fit(data).transform(data)
->>> result.select("label.result").show(truncate=False)
-+------------------------------------------------------------------------------------+
-|result                                                                              |
-+------------------------------------------------------------------------------------+
-|[O, O, B-Organism, I-Organism, O, O, B-Organism, I-Organism, O, O, B-Cell, I-Cell, O, O, O, B-Multi-tissue_structure, I-Multi-tissue_structure, O, B-Cell, I-Cell, O]|
-+------------------------------------------------------------------------------------+
-
-
-

Methods

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

__init__([classname, java_model])

Initialize this instance with a Java model object.

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

getBatchSize()

Gets current batch size.

getCaseSensitive()

Gets whether to ignore case in tokens for embeddings matching.

getInputCols()

Gets current column names of input annotations.

getLazyAnnotator()

Gets whether Annotator should be evaluated lazily in a RecursivePipeline.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getOutputCol()

Gets output column name of annotations.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

loadSavedModel(folder, spark_session)

Loads a locally saved model.

loadSavedModelOpenSource(...)

Loads a locally saved model.

pretrained([name, lang, remote_loc])

Downloads and loads a pretrained model.

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

set(param, value)

Sets a parameter in the embedded param map.

setBatchSize(v)

Sets batch size.

setCaseSensitive(value)

Sets whether to ignore case in tokens for embeddings matching.

setConfigProtoBytes(b)

Sets configProto from tensorflow, serialized into byte array.

setInputCols(*value)

Sets column names of input annotations.

setLazyAnnotator(value)

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

setMaxSentenceLength(value)

Sets max sentence length to process, by default 128.

setOutputCol(value)

Sets output column name of annotations.

setParamValue(paramName)

Sets the value of a parameter.

setParams()

transform(dataset[, params])

Transforms the input dataset with optional parameters.

write()

Returns an MLWriter instance for this ML instance.

-

Attributes

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

batchSize

caseSensitive

configProtoBytes

getter_attrs

inputCols

lazyAnnotator

maxSentenceLength

name

outputCol

params

Returns all params ordered by name.

-
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some -extra params. This implementation first calls Params.copy and -then make a copy of the companion Java pipeline component with -extra params. So both the Python wrapper and the Java pipeline -component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
-
-getBatchSize()
-

Gets current batch size.

-
-
Returns
-
-
int

Current batch size

-
-
-
-
-
- -
-
-getCaseSensitive()
-

Gets whether to ignore case in tokens for embeddings matching.

-
-
Returns
-
-
bool

Whether to ignore case in tokens for embeddings matching

-
-
-
-
-
- -
-
-getInputCols()
-

Gets current column names of input annotations.

-
- -
-
-getLazyAnnotator()
-

Gets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getOutputCol()
-

Gets output column name of annotations.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-static loadSavedModel(folder, spark_session)[source]
-

Loads a locally saved model.

-
-
Parameters
-
-
folderstr

Folder of the saved model

-
-
spark_sessionpyspark.sql.SparkSession

The current SparkSession

-
-
-
-
Returns
-
-
MedicalBertForTokenClassifier

The restored model

-
-
-
-
-
- -
-
-static loadSavedModelOpenSource(bertForTokenClassifierPath, tfModelPath, spark_session)[source]
-

Loads a locally saved model.

-
-
Parameters
-
-
bertForTokenClassifierPathstr

Folder of the bertForTokenClassifier

-
-
tfModelPathstr

Folder that contains the TF model

-
-
spark_sessionpyspark.sql.SparkSession

The current SparkSession

-
-
Returns
-
-
-
MedicalBertForTokenClassifier

The restored model

-
-
-
-
-
- -
-
-property params
-

Returns all params ordered by name. The default implementation -uses dir() to get all attributes of type -Param.

-
- -
-
-static pretrained(name='bert_token_classifier_ner_bionlp', lang='en', remote_loc='clinical/models')[source]
-

Downloads and loads a pretrained model.

-
-
Parameters
-
-
namestr, optional

Name of the pretrained model, by default "bert_token_classifier_ner_bionlp"

-
-
langstr, optional

Language of the pretrained model, by default "en"

-
-
remote_locstr, optional

Optional remote address of the resource, by default None. Will use Spark NLP's repositories otherwise.

-
-
-
-
Returns
-
-
MedicalBertForTokenClassifier

The restored model

-
-
-
-
-
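For example, a hedged sketch using the default model from the signature above (assumes the model can be downloaded or is cached locally):

>>> tokenClassifier = MedicalBertForTokenClassifier.pretrained("bert_token_classifier_ner_bionlp", "en", "clinical/models")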
- -
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setBatchSize(v)
-

Sets batch size.

-
-
Parameters
-
-
vint

Batch size

-
-
-
-
-
- -
-
-setCaseSensitive(value)
-

Sets whether to ignore case in tokens for embeddings matching.

-
-
Parameters
-
-
valuebool

Whether to ignore case in tokens for embeddings matching

-
-
-
-
-
- -
-
-setConfigProtoBytes(b)[source]
-

Sets configProto from tensorflow, serialized into byte array.

-
-
Parameters
-
-
bList[str]

ConfigProto from tensorflow, serialized into byte array

-
-
-
-
-
- -
-
-setInputCols(*value)
-

Sets column names of input annotations.

-
-
Parameters
-
-
*valuestr

Input columns for the annotator

-
-
-
-
-
- -
-
-setLazyAnnotator(value)
-

Sets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
-
Parameters
-
-
valuebool

Whether Annotator should be evaluated lazily in a -RecursivePipeline

-
-
-
-
-
- -
-
-setMaxSentenceLength(value)[source]
-

Sets max sentence length to process, by default 128.

-
-
Parameters
-
-
valueint

Max sentence length to process

-
-
-
-
-
- -
-
-setOutputCol(value)
-

Sets output column name of annotations.

-
-
Parameters
-
-
valuestr

Name of output column

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-transform(dataset, params=None)
-

Transforms the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset

-
-
paramsdict, optional

an optional param map that overrides embedded params.

-
-
-
-
Returns
-
-
pyspark.sql.DataFrame

transformed dataset

-
-
-
-
-
- -
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

-
- -
- -
- - -
- - - - -
- - -
-
- - - -
-
- - - - - -
-
\ No newline at end of file
diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.MedicalDistilBertForSequenceClassification.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.MedicalDistilBertForSequenceClassification.html
deleted file mode 100644
index f879904e5d..0000000000
--- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.MedicalDistilBertForSequenceClassification.html
+++ /dev/null
@@ -1,1278 +0,0 @@
-sparknlp_jsl.annotator.MedicalDistilBertForSequenceClassification — Spark NLP 3.3.0 documentation

sparknlp_jsl.annotator.MedicalDistilBertForSequenceClassification

-
-
-class sparknlp_jsl.annotator.MedicalDistilBertForSequenceClassification(classname='com.johnsnowlabs.nlp.annotators.classification.MedicalDistilBertForSequenceClassification', java_model=None)[source]
-

Bases: sparknlp.common.AnnotatorModel, sparknlp.common.HasCaseSensitiveProperties, sparknlp.common.HasBatchedAnnotate

-

MedicalDistilBertForSequenceClassification can load DistilBERT Models with sequence classification/regression head on -top (a linear layer on top of the pooled output) e.g. for multi-class document classification tasks.

-

Pretrained models can be loaded with pretrained() of the companion -object:

-
>>> sequenceClassifier = MedicalDistilBertForSequenceClassification.pretrained() \
-...     .setInputCols(["token", "document"]) \
-...     .setOutputCol("label")
-
-
-

Models from the HuggingFace 🤗 Transformers library are also compatible with -Spark NLP 🚀. To see which models are compatible and how to import them see -Import Transformers into Spark NLP 🚀.

- ---- - - - - - - - - - - -

Input Annotation types

Output Annotation type

DOCUMENT, TOKEN

CATEGORY

-
-
Parameters
-
-
batchSize

Batch size. Large values allow faster processing but require more memory, by default 8

-
-
caseSensitive

Whether to ignore case in tokens for embeddings matching, by default -True

-
-
configProtoBytes

ConfigProto from tensorflow, serialized into byte array.

-
-
maxSentenceLength

Max sentence length to process, by default 128

-
-
coalesceSentences

Instead of 1 class per sentence (if inputCols is sentence) output 1 class per document by averaging -probabilities in all sentences.

-
-
-
-
-

Examples

-
>>> import sparknlp
->>> from sparknlp.base import *
->>> from sparknlp.annotator import *
->>> from pyspark.ml import Pipeline
->>> documentAssembler = DocumentAssembler() \
-...     .setInputCol("text") \
-...     .setOutputCol("document")
->>> tokenizer = Tokenizer() \
-...     .setInputCols(["document"]) \
-...     .setOutputCol("token")
->>> sequenceClassifier = MedicalDistilBertForSequenceClassification.pretrained() \
-...     .setInputCols(["token", "document"]) \
-...     .setOutputCol("label") \
-...     .setCaseSensitive(True)
->>> pipeline = Pipeline().setStages([
-...     documentAssembler,
-...     tokenizer,
-...     sequenceClassifier
-... ])
->>> data = spark.createDataFrame([["I felt a bit drowsy and had blurred vision after taking Aspirin."]]).toDF("text")
->>> result = pipeline.fit(data).transform(data)
->>> result.select("label.result").show(truncate=False)
-
-
-

Methods

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

__init__([classname, java_model])

Initialize this instance with a Java model object.

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

getBatchSize()

Gets current batch size.

getCaseSensitive()

Gets whether to ignore case in tokens for embeddings matching.

getClasses()

Returns labels used to train this model

getInputCols()

Gets current column names of input annotations.

getLazyAnnotator()

Gets whether Annotator should be evaluated lazily in a RecursivePipeline.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getOutputCol()

Gets output column name of annotations.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

loadSavedModel(folder, spark_session)

Loads a locally saved model.

loadSavedModelOpenSource(...)

Loads a locally saved model.

pretrained([name, lang, remote_loc])

Downloads and loads a pretrained model.

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

set(param, value)

Sets a parameter in the embedded param map.

setBatchSize(v)

Sets batch size.

setCaseSensitive(value)

Sets whether to ignore case in tokens for embeddings matching.

setCoalesceSentences(value)

Instead of 1 class per sentence (if inputCols is '''sentence''') output 1 class per document by averaging probabilities in all sentences.

setConfigProtoBytes(b)

Sets configProto from tensorflow, serialized into byte array.

setInputCols(*value)

Sets column names of input annotations.

setLazyAnnotator(value)

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

setMaxSentenceLength(value)

Sets max sentence length to process, by default 128.

setOutputCol(value)

Sets output column name of annotations.

setParamValue(paramName)

Sets the value of a parameter.

setParams()

transform(dataset[, params])

Transforms the input dataset with optional parameters.

write()

Returns an MLWriter instance for this ML instance.

-

Attributes

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

batchSize

caseSensitive

coalesceSentences

configProtoBytes

getter_attrs

inputCols

lazyAnnotator

maxSentenceLength

name

outputCol

params

Returns all params ordered by name.

-
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some -extra params. This implementation first calls Params.copy and -then make a copy of the companion Java pipeline component with -extra params. So both the Python wrapper and the Java pipeline -component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
-
-getBatchSize()
-

Gets current batch size.

-
-
Returns
-
-
int

Current batch size

-
-
-
-
-
- -
-
-getCaseSensitive()
-

Gets whether to ignore case in tokens for embeddings matching.

-
-
Returns
-
-
bool

Whether to ignore case in tokens for embeddings matching

-
-
-
-
-
- -
-
-getClasses()[source]
-

Returns labels used to train this model

-
- -
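For instance, a small sketch inspecting the label set of a pretrained model (the returned labels depend on the chosen model):

>>> sequenceClassifier = MedicalDistilBertForSequenceClassification.pretrained()
>>> sequenceClassifier.getClasses()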
-
-getInputCols()
-

Gets current column names of input annotations.

-
- -
-
-getLazyAnnotator()
-

Gets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getOutputCol()
-

Gets output column name of annotations.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-static loadSavedModel(folder, spark_session)[source]
-

Loads a locally saved model.

-
-
Parameters
-
-
folderstr

Folder of the saved model

-
-
spark_sessionpyspark.sql.SparkSession

The current SparkSession

-
-
-
-
Returns
-
-
MedicalDistilBertForSequenceClassification

The restored model

-
-
-
-
-
- -
-
-static loadSavedModelOpenSource(destilBertForTokenClassifierPath, tfModelPath, spark_session)[source]
-

Loads a locally saved model.

-
-
Parameters
-
-
destilBertForTokenClassifierPathstr

Folder of the bertForTokenClassifier

-
-
tfModelPathstr

Folder that contains the TF model

-
-
spark_sessionpyspark.sql.SparkSession

The current SparkSession

-
-
Returns
-
-
-
MedicalDistilBertForSequenceClassification

The restored model

-
-
-
-
-
- -
-
-property params
-

Returns all params ordered by name. The default implementation -uses dir() to get all attributes of type -Param.

-
- -
-
-static pretrained(name='distilbert_sequence_classifier_ade', lang='en', remote_loc='clinical/models')[source]
-

Downloads and loads a pretrained model.

-
-
Parameters
-
-
namestr, optional

Name of the pretrained model, by default

-
-
langstr, optional

Language of the pretrained model, by default “en”

-
-
remote_locstr, optional

Optional remote address of the resource, by default None. Will use Spark NLP's repositories otherwise.

-
-
-
-
Returns
-
-
MedicalDistilBertForSequenceClassification

The restored model

-
-
-
-
-
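For example, a hedged sketch with the defaults from the signature above (assumes the model can be downloaded or is cached locally):

>>> sequenceClassifier = MedicalDistilBertForSequenceClassification.pretrained("distilbert_sequence_classifier_ade", "en", "clinical/models")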
- -
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setBatchSize(v)
-

Sets batch size.

-
-
Parameters
-
-
vint

Batch size

-
-
-
-
-
- -
-
-setCaseSensitive(value)
-

Sets whether to ignore case in tokens for embeddings matching.

-
-
Parameters
-
-
valuebool

Whether to ignore case in tokens for embeddings matching

-
-
-
-
-
- -
-
-setCoalesceSentences(value)[source]
-

Instead of one class per sentence (if inputCols is "sentence"), output one class per document by averaging the probabilities of all sentences. Due to the max sequence length limit in almost all transformer models such as BERT (512 tokens), this parameter helps feed all the sentences into the model and average all the probabilities for the entire document instead of per sentence. (Default: true)

-
-
Parameters
-
-
valuebool

If the output of all sentences will be averaged to one output

-
-
-
-
-
- -
-
-setConfigProtoBytes(b)[source]
-

Sets configProto from tensorflow, serialized into byte array.

-
-
Parameters
-
-
bList[int]

ConfigProto from tensorflow, serialized into byte array

-
-
-
-
-
- -
-
-setInputCols(*value)
-

Sets column names of input annotations.

-
-
Parameters
-
-
*valuestr

Input columns for the annotator

-
-
-
-
-
- -
-
-setLazyAnnotator(value)
-

Sets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
-
Parameters
-
-
valuebool

Whether Annotator should be evaluated lazily in a -RecursivePipeline

-
-
-
-
-
- -
-
-setMaxSentenceLength(value)[source]
-

Sets max sentence length to process, by default 128.

-
-
Parameters
-
-
valueint

Max sentence length to process

-
-
-
-
-
- -
-
-setOutputCol(value)
-

Sets output column name of annotations.

-
-
Parameters
-
-
valuestr

Name of output column

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-transform(dataset, params=None)
-

Transforms the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset

-
-
paramsdict, optional

an optional param map that overrides embedded params.

-
-
-
-
Returns
-
-
pyspark.sql.DataFrame

transformed dataset

-
-
-
-
-
- -
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

-
- -
- -
- - -
- - - - -
- - -
-
- - - -
-
- - - - - -
-
\ No newline at end of file
diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.MedicalNerApproach.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.MedicalNerApproach.html
deleted file mode 100644
index ea25f76581..0000000000
--- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.MedicalNerApproach.html
+++ /dev/null
@@ -1,1764 +0,0 @@
-sparknlp_jsl.annotator.MedicalNerApproach — Spark NLP 3.3.0 documentation

sparknlp_jsl.annotator.MedicalNerApproach

-
-
-class sparknlp_jsl.annotator.MedicalNerApproach[source]
-

Bases: sparknlp.common.AnnotatorApproach, sparknlp.annotator.NerApproach

-

This Named Entity Recognition annotator allows you to train a generic NER model based on neural networks.

-

The architecture of the neural network is a Char CNNs - BiLSTM - CRF that achieves state-of-the-art results on most datasets.

-

For instantiated/pretrained models, see NerDLModel.

-

The training data should be a labeled Spark Dataset, in the format of -CoNLL 2003 IOB with Annotation type columns. The data should -have columns of type DOCUMENT, TOKEN, WORD_EMBEDDINGS and an additional -label column of annotator type NAMED_ENTITY.

-

Excluding the label, this preprocessing can be done with, for example:

-
  • a SentenceDetector,

  • a Tokenizer and

  • a WordEmbeddingsModel (any embeddings can be chosen, e.g. BertEmbeddings for BERT based embeddings).
-

For extended examples of usage, see the Spark NLP Workshop.

- ---- - - - - - - - - - - -

Input Annotation types

Output Annotation type

DOCUMENT, TOKEN, WORD_EMBEDDINGS

NAMED_ENTITY

-
-
Parameters
-
-
labelColumn

Column with label per each token

-
-
entities

Entities to recognize

-
-
minEpochs

Minimum number of epochs to train, by default 0

-
-
maxEpochs

Maximum number of epochs to train, by default 50

-
-
verbose

Level of verbosity during training, by default 2

-
-
randomSeed

Random seed

-
-
lr

Learning Rate, by default 0.001

-
-
po

Learning rate decay coefficient. Real Learning Rate = lr / (1 + po * epoch), by default 0.005

-
-
batchSize

Batch size, by default 8

-
-
dropout

Dropout coefficient, by default 0.5

-
-
graphFolder

Folder path that contain external graph files

-
-
configProtoBytes

ConfigProto from tensorflow, serialized into byte array.

-
-
useContrib

whether to use contrib LSTM Cells. Not compatible with Windows. Might -slightly improve accuracy

-
-
validationSplit

Choose the proportion of the training dataset to be validated against the model on each epoch. The value should be between 0.0 and 1.0; by default it is 0.0 (off).

-
-
evaluationLogExtended

Whether logs for validation to be extended, by default False.

-
-
testDataset

Path to test dataset. If set used to calculate statistic on it during -training.

-
-
includeConfidence

whether to include confidence scores in annotation metadata, by default -False

-
-
includeAllConfidenceScores

whether to include all confidence scores in annotation metadata or just -the score of the predicted tag, by default False

-
-
enableOutputLogs

Whether to use stdout in addition to Spark logs, by default False

-
-
outputLogsPath

Folder path to save training logs

-
-
enableMemoryOptimizer

Whether to optimize for large datasets or not. Enabling this option can -slow down training, by default False

-
-
-
-
-

Examples

-
>>> import sparknlp
->>> from sparknlp.base import *
->>> from sparknlp.common import *
->>> from sparknlp.annotator import *
->>> from sparknlp.training import *
->>> import sparknlp_jsl
->>> from sparknlp_jsl.base import *
->>> from sparknlp_jsl.annotator import *
->>> from pyspark.ml import Pipeline
-
-
-

First extract the prerequisites for the NerDLApproach

-
>>> documentAssembler = DocumentAssembler() \
-...     .setInputCol("text") \
-...     .setOutputCol("document")
->>> sentence = SentenceDetector() \
-...     .setInputCols(["document"]) \
-...     .setOutputCol("sentence")
->>> tokenizer = Tokenizer() \
-...     .setInputCols(["sentence"]) \
-...     .setOutputCol("token")
->>> embeddings = BertEmbeddings.pretrained() \
-...     .setInputCols(["sentence", "token"]) \
-...     .setOutputCol("embeddings")
-
-
-

Then the training can start

-
>>> nerTagger = MedicalNerApproach() \
-...     .setInputCols(["sentence", "token", "embeddings"]) \
-...     .setLabelColumn("label") \
-...     .setOutputCol("ner") \
-...     .setMaxEpochs(1) \
-...     .setRandomSeed(0) \
-...     .setVerbose(0)
->>> pipeline = Pipeline().setStages([
-...     documentAssembler,
-...     sentence,
-...     tokenizer,
-...     embeddings,
-...     nerTagger
-... ])
->>> conll = CoNLL()
->>> trainingData = conll.readDataset(spark, "src/test/resources/conll2003/eng.train")
->>> pipelineModel = pipeline.fit(trainingData)
-
-
-
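The optional training controls listed in the Parameters table above can be chained in the same way; a hedged sketch with illustrative values (not tuned recommendations):

>>> nerTagger = MedicalNerApproach() \
...     .setInputCols(["sentence", "token", "embeddings"]) \
...     .setLabelColumn("label") \
...     .setOutputCol("ner") \
...     .setMaxEpochs(10) \
...     .setLr(0.001) \
...     .setPo(0.005) \
...     .setBatchSize(8) \
...     .setDropout(0.5) \
...     .setValidationSplit(0.1) \
...     .setEvaluationLogExtended(True) \
...     .setEnableOutputLogs(True) \
...     .setOutputLogsPath("ner_logs")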

Methods

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

__init__()

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

fit(dataset[, params])

Fits a model to the input dataset with optional parameters.

fitMultiple(dataset, paramMaps)

Fits a model to the input dataset for each param map in paramMaps.

getInputCols()

Gets current column names of input annotations.

getLabelColumn()

Gets column for label per each token.

getLazyAnnotator()

Gets whether Annotator should be evaluated lazily in a RecursivePipeline.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getOutputCol()

Gets output column name of annotations.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

set(param, value)

Sets a parameter in the embedded param map.

setBatchSize(v)

Sets batch size, by default 64.

setConfigProtoBytes(b)

Sets configProto from tensorflow, serialized into byte array.

setDropout(v)

Sets dropout coefficient, by default 0.5.

setEarlyStoppingCriterion(criterion)

Sets early stopping criterion.

setEarlyStoppingPatience(patience)

Sets the number of epochs with no performance improvement before training is terminated.

setEnableMemoryOptimizer(value)

Sets whether to optimize for large datasets, by default False.

setEnableOutputLogs(value)

Sets whether to use stdout in addition to Spark logs, by default False.

setEntities(tags)

Sets entities to recognize.

setEvaluationLogExtended(v)

Sets whether logs for validation to be extended, by default False.

setGraphFile(ff)

Sets path that contains the external graph file.

setGraphFolder(p)

Sets folder path that contain external graph files.

setIncludeAllConfidenceScores(value)

Sets whether to include all confidence scores in annotation metadata or just the score of the predicted tag, by default False.

setIncludeConfidence(value)

Sets whether to include confidence scores in annotation metadata, by default False.

setInputCols(*value)

Sets column names of input annotations.

setLabelColumn(value)

Sets name of column for data labels.

setLazyAnnotator(value)

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

setLogPrefix(s)

Sets folder path to save training logs.

setLr(v)

Sets Learning Rate, by default 0.001.

setMaxEpochs(epochs)

Sets maximum number of epochs to train.

setMinEpochs(epochs)

Sets minimum number of epochs to train.

setOutputCol(value)

Sets output column name of annotations.

setOutputLogsPath(p)

Sets folder path to save training logs.

setOverrideExistingTags(value)

Sets whether to override already learned tags when using a pretrained model to initialize the new model.

setParamValue(paramName)

Sets the value of a parameter.

setPo(v)

Sets Learning rate decay coefficient, by default 0.005.

setPretrainedModelPath(value)

Sets the path to an already trained MedicalNerModel, used as a starting point for training the new model.

setRandomSeed(seed)

Sets random seed for shuffling.

setTagsMapping(value)

Sets a map specifying how old tags are mapped to new ones.

setTestDataset(path[, read_as, options])

Sets Path to test dataset.

setUseBestModel(value)

Sets whether to restore and use the model that has achieved the best performance at the end of the training.

setUseContrib(v)

Sets whether to use contrib LSTM Cells.

setValidationSplit(v)

Sets the proportion of training dataset to be validated against the model on each Epoch, by default it is 0.0 and off.

setVerbose(verboseValue)

Sets level of verbosity during training.

write()

Returns an MLWriter instance for this ML instance.

-

Attributes


batchSize

configProtoBytes

dropout

earlyStoppingCriterion

earlyStoppingPatience

enableMemoryOptimizer

enableOutputLogs

entities

evaluationLogExtended

getter_attrs

graphFile

graphFolder

includeAllConfidenceScores

includeConfidence

inputCols

labelColumn

lazyAnnotator

logPrefix

lr

maxEpochs

minEpochs

outputCol

outputLogsPath

overrideExistingTags

params

Returns all params ordered by name.

po

pretrainedModelPath

randomSeed

tagsMapping

testDataset

useBestModel

useContrib

validationSplit

verbose

-
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some -extra params. This implementation first calls Params.copy and -then make a copy of the companion Java pipeline component with -extra params. So both the Python wrapper and the Java pipeline -component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
-
-fit(dataset, params=None)
-

Fits a model to the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset.

-
-
paramsdict or list or tuple, optional

an optional param map that overrides embedded params. If a list/tuple of -param maps is given, this calls fit on each param map and returns a list of -models.

-
-
-
-
Returns
-
-
Transformer or a list of Transformer

fitted model(s)

-
-
-
-
-
- -
-
-fitMultiple(dataset, paramMaps)
-

Fits a model to the input dataset for each param map in paramMaps.

-
-

New in version 2.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset.

-
-
paramMapscollections.abc.Sequence

A Sequence of param maps.

-
-
-
-
Returns
-
-
_FitMultipleIterator

A thread safe iterable which contains one model for each param map. Each -call to next(modelIterator) will return (index, model) where model was fit -using paramMaps[index]. index values may not be sequential.

-
-
-
-
-
- -
-
-getInputCols()
-

Gets current column names of input annotations.

-
- -
-
-getLabelColumn()
-

Gets column for label per each token.

-
-
Returns
-
-
str

Column with label per each token

-
-
-
-
-
- -
-
-getLazyAnnotator()
-

Gets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getOutputCol()
-

Gets output column name of annotations.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-property params
-

Returns all params ordered by name. The default implementation -uses dir() to get all attributes of type -Param.

-
- -
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setBatchSize(v)[source]
-

Sets batch size, by default 64.

-
-
Parameters
-
-
vint

Batch size

-
-
-
-
-
- -
-
-setConfigProtoBytes(b)[source]
-

Sets configProto from tensorflow, serialized into byte array.

-
-
Parameters
-
-
bList[str]

ConfigProto from tensorflow, serialized into byte array

-
-
-
-
-
- -
-
-setDropout(v)[source]
-

Sets dropout coefficient, by default 0.5.

-
-
Parameters
-
-
vfloat

Dropout coefficient

-
-
-
-
-
- -
-
-setEarlyStoppingCriterion(criterion)[source]
-

Sets early stopping criterion. A value 0 means no early stopping.

-
-
Parameters
-
-
criterionfloat

Early stopping criterion.

-
-
-
-
-
- -
-
-setEarlyStoppingPatience(patience)[source]
-

Sets the number of epochs with no performance improvement before training is terminated.

-
-
Parameters
-
-
patienceint

Early stopping patience.

-
-
-
-
-
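A minimal sketch of combining the two early-stopping setters on the approach; the criterion and patience values below are arbitrary illustrative choices, not defaults:

>>> nerTagger = MedicalNerApproach() \
...     .setInputCols(["sentence", "token", "embeddings"]) \
...     .setLabelColumn("label") \
...     .setOutputCol("ner") \
...     .setEarlyStoppingCriterion(0.01) \
...     .setEarlyStoppingPatience(3)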
- -
-
-setEnableMemoryOptimizer(value)[source]
-

Sets whether to optimize for large datasets, by default False. Enabling this option can slow down training.

-
-
Parameters
-
-
valuebool

Whether to optimize for large datasets

-
-
-
-
-
- -
-
-setEnableOutputLogs(value)[source]
-

Sets whether to use stdout in addition to Spark logs, by default -False.

-
-
Parameters
-
-
valuebool

Whether to use stdout in addition to Spark logs

-
-
-
-
-
- -
-
-setEntities(tags)
-

Sets entities to recognize.

-
-
Parameters
-
-
tagsList[str]

List of entities

-
-
-
-
-
- -
-
-setEvaluationLogExtended(v)[source]
-

Sets whether logs for validation should be extended, by default False. Displays time and evaluation of each label.

-
-
Parameters
-
-
vbool

Whether logs for validation to be extended

-
-
-
-
-
- -
-
-setGraphFile(ff)[source]
-

Sets path that contains the external graph file. When specified, the provided file will be used, and no graph search will happen.

-
-
Parameters
-
-
pstr

Path that contains the external graph file. When specified, the provided file will be used, and no graph search will happen.

-
-
-
-
-
- -
-
-setGraphFolder(p)[source]
-

Sets folder path that contain external graph files.

-
-
Parameters
-
-
pstr

Folder path that contain external graph files

-
-
-
-
-
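A short sketch of pointing the approach at custom TensorFlow graphs; both paths are placeholders:

>>> nerTagger = nerTagger.setGraphFolder("path/to/custom/graphs")
>>> # or pin one specific graph file so that no graph search happens
>>> nerTagger = nerTagger.setGraphFile("path/to/custom/graphs/blstm_10_100_128_120.pb")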
- -
-
-setIncludeAllConfidenceScores(value)[source]
-

Sets whether to include all confidence scores in annotation metadata -or just the score of the predicted tag, by default False.

-
-
Parameters
-
-
valuebool

Whether to include all confidence scores in annotation metadata or -just the score of the predicted tag

-
-
-
-
-
- -
-
-setIncludeConfidence(value)[source]
-

Sets whether to include confidence scores in annotation metadata, by -default False.

-
-
Parameters
-
-
valuebool

Whether to include the confidence value in the output.

-
-
-
-
-
- -
-
-setInputCols(*value)
-

Sets column names of input annotations.

-
-
Parameters
-
-
*valuestr

Input columns for the annotator

-
-
-
-
-
- -
-
-setLabelColumn(value)
-

Sets name of column for data labels.

-
-
Parameters
-
-
valuestr

Column for data labels

-
-
-
-
-
- -
-
-setLazyAnnotator(value)
-

Sets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
-
Parameters
-
-
valuebool

Whether Annotator should be evaluated lazily in a -RecursivePipeline

-
-
-
-
-
- -
-
-setLogPrefix(s)[source]
-

Sets folder path to save training logs.

-
-
Parameters
-
-
pstr

Folder path to save training logs

-
-
-
-
-
- -
-
-setLr(v)[source]
-

Sets Learning Rate, by default 0.001.

-
-
Parameters
-
-
vfloat

Learning Rate

-
-
-
-
-
- -
-
-setMaxEpochs(epochs)
-

Sets maximum number of epochs to train.

-
-
Parameters
-
-
epochsint

Maximum number of epochs to train

-
-
-
-
-
- -
-
-setMinEpochs(epochs)
-

Sets minimum number of epochs to train.

-
-
Parameters
-
-
epochsint

Minimum number of epochs to train

-
-
-
-
-
- -
-
-setOutputCol(value)
-

Sets output column name of annotations.

-
-
Parameters
-
-
valuestr

Name of output column

-
-
-
-
-
- -
-
-setOutputLogsPath(p)[source]
-

Sets folder path to save training logs.

-
-
Parameters
-
-
pstr

Folder path to save training logs

-
-
-
-
-
- -
-
-setOverrideExistingTags(value)[source]
-

Sets whether to override already learned tags when using a pretrained model to initialize the new model. Default is ‘true’

-
-
Parameters
-
-
valuebool

Whether to override already learned tags when using a pretrained model to initialize the new model. Default is ‘true’

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-setPo(v)[source]
-

Sets Learning rate decay coefficient, by default 0.005.

-

The effective learning rate is lr / (1 + po * epoch).

-
-
Parameters
-
-
vfloat

Learning rate decay coefficient

-
-
-
-
-
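A quick arithmetic check of the decay formula above, using the default lr = 0.001 and po = 0.005:

>>> lr, po = 0.001, 0.005
>>> round(lr / (1 + po * 10), 6)   # effective learning rate at epoch 10
0.000952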
- -
-
-setPretrainedModelPath(value)[source]
-

Sets the path to an already trained MedicalNerModel, which is used as a starting point for training the new model.

-
-
Parameters
-
-
valuestr

Path to an already trained MedicalNerModel, which is used as a starting point for training the new model.

-
-
-
-
-
- -
-
-setRandomSeed(seed)
-

Sets random seed for shuffling.

-
-
Parameters
-
-
seedint

Random seed for shuffling

-
-
-
-
-
- -
-
-setTagsMapping(value)[source]
-

Sets a map specifying how old tags are mapped to new ones. It only takes effect in combination with setOverrideExistingTags.

-
-
Parameters
-
-
valuelist

A map specifying how old tags are mapped to new ones. It only takes effect in combination with setOverrideExistingTags.

-
-
-
-
-
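A sketch of continuing training from an existing model; the path is a placeholder. setTagsMapping can additionally remap old labels to new ones, but its exact pair format is not shown here:

>>> nerTagger = nerTagger \
...     .setPretrainedModelPath("path/to/previous_medical_ner_model") \
...     .setOverrideExistingTags(False)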
- -
-
-setTestDataset(path, read_as='SPARK', options={'format': 'parquet'})[source]
-

Sets the path to a test dataset. If set, it is used to calculate statistics on it during training.

-
-
Parameters
-
-
pathstr

Path to test dataset

-
-
read_asstr, optional

How to read the resource, by default ReadAs.SPARK

-
-
optionsdict, optional

Options for reading the resource, by default {“format”: “parquet”}

-
-
-
-
-
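For example, assuming a labeled test set has already been written as parquet to a placeholder path (the default read_as and options shown above then apply):

>>> nerTagger = nerTagger.setTestDataset("path/to/test_data.parquet")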
- -
-
-setUseBestModel(value)[source]
-

Sets whether to restore and use the model that achieved the best performance at the end of training. The metric being monitored is macro F1.

-
-
Parameters
-
-
valuebool

Whether to return the model that has achieved the best metrics across epochs.

-
-
-
-
-
- -
-
-setUseContrib(v)[source]
-

Sets whether to use contrib LSTM cells. Not compatible with Windows. Might slightly improve accuracy.

-
-
Parameters
-
-
vbool

Whether to use contrib LSTM Cells

-
-
-
-
Raises
-
-
Exception

Windows not supported to use contrib

-
-
-
-
-
- -
-
-setValidationSplit(v)[source]
-

Sets the proportion of the training dataset to be validated against the model on each epoch; by default it is 0.0 and off. The value should be between 0.0 and 1.0.

-
-
Parameters
-
-
vfloat

Proportion of training dataset to be validated

-
-
-
-
-
- -
-
-setVerbose(verboseValue)
-

Sets level of verbosity during training.

-
-
Parameters
-
-
verboseValueint

Level of verbosity

-
-
-
-
-
- -
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

\ No newline at end of file
diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.MedicalNerModel.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.MedicalNerModel.html
deleted file mode 100644
index 7833f017ed..0000000000
--- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.MedicalNerModel.html
+++ /dev/null
@@ -1,1251 +0,0 @@
-sparknlp_jsl.annotator.MedicalNerModel — Spark NLP 3.3.0 documentation

sparknlp_jsl.annotator.MedicalNerModel

-
-
-class sparknlp_jsl.annotator.MedicalNerModel(classname='com.johnsnowlabs.nlp.annotators.ner.MedicalNerModel', java_model=None)[source]
-

Bases: sparknlp.common.AnnotatorModel, sparknlp.common.HasStorageRef, sparknlp.common.HasBatchedAnnotate

-

This Named Entity Recognition annotator is a generic NER model based on neural networks.

-

The neural network architecture is Char CNNs - BiLSTM - CRF, which achieves state-of-the-art results on most datasets.

-

This is the instantiated model of the MedicalNerApproach. For training your own model, please see the documentation of that class.

-

Pretrained models can be loaded with pretrained() of the companion object:

-
>>> nerModel = MedicalNerModel.pretrained() \
-...     .setInputCols(["sentence", "token", "embeddings"]) \
-...     .setOutputCol("ner")
-
-
-

The default model is "ner_dl", if no name is provided.

-

For available pretrained models please see the Models Hub. Additionally, pretrained pipelines are available for this module, see Pipelines.

-

Note that some pretrained models require specific types of embeddings, depending on which they were trained on. For example, the default model "ner_dl" requires the WordEmbeddings "glove_100d".

-

For extended examples of usage, see the Spark NLP Workshop.

Input Annotation types: DOCUMENT, TOKEN, WORD_EMBEDDINGS

Output Annotation type: NAMED_ENTITY

-
-
Parameters
-
-
batchSize

Size of every batch, by default 8

-
-
configProtoBytes

ConfigProto from tensorflow, serialized into byte array.

-
-
includeConfidence

Whether to include confidence scores in annotation metadata, by default -False

-
-
includeAllConfidenceScores

Whether to include all confidence scores in annotation metadata or just -the score of the predicted tag, by default False

-
-
inferenceBatchSize

Number of sentences to process in a single batch during inference

-
-
classes

Tags used to trained this NerDLModel

-
-
labelCasing

Sets all labels of the NER model to upper or lower case. Possible values: upper, lower.

-
-
-
-
-

Examples

-
>>> import sparknlp
->>> from sparknlp.base import *
->>> from sparknlp.common import *
->>> from sparknlp.annotator import *
->>> from sparknlp.training import *
->>> import sparknlp_jsl
->>> from sparknlp_jsl.base import *
->>> from sparknlp_jsl.annotator import *
->>> from pyspark.ml import Pipeline
->>> documentAssembler = DocumentAssembler() \
-...     .setInputCol("text") \
-...     .setOutputCol("document")
->>> sentence = SentenceDetector() \
-...     .setInputCols(["document"]) \
-...     .setOutputCol("sentence")
->>> tokenizer = Tokenizer() \
-...     .setInputCols(["sentence"]) \
-...     .setOutputCol("token")
->>> embeddings = WordEmbeddingsModel.pretrained() \
-...     .setInputCols(["sentence", "token"]) \
-...     .setOutputCol("bert")
->>> nerTagger = MedicalNerModel.pretrained() \
-...     .setInputCols(["sentence", "token", "bert"]) \
-...     .setOutputCol("ner")
->>> pipeline = Pipeline().setStages([
-...     documentAssembler,
-...     sentence,
-...     tokenizer,
-...     embeddings,
-...     nerTagger
-... ])
->>> data = spark.createDataFrame([["U.N. official Ekeus heads for Baghdad."]]).toDF("text")
->>> result = pipeline.fit(data).transform(data)
-
-
-

Methods


__init__([classname, java_model])

Initialize this instance with a Java model object.

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

getBatchSize()

Gets current batch size.

getInputCols()

Gets current column names of input annotations.

getLazyAnnotator()

Gets whether Annotator should be evaluated lazily in a RecursivePipeline.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getOutputCol()

Gets output column name of annotations.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

getStorageRef()

Gets unique reference name for identification.

getTrainingClassDistribution()

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

loadSavedModel(ner_model_path, folder, ...)

pretrained([name, lang, remote_loc])

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

set(param, value)

Sets a parameter in the embedded param map.

setBatchSize(v)

Sets batch size.

setConfigProtoBytes(b)

Sets configProto from tensorflow, serialized into byte array.

setIncludeConfidence(value)

Sets whether to include confidence scores in annotation metadata, by default False.

setInferenceBatchSize(value)

Sets number of sentences to process in a single batch during inference

setInputCols(*value)

Sets column names of input annotations.

setLabelCasing(value)

Sets all labels of the NER model to upper or lower case.

setLazyAnnotator(value)

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

setOutputCol(value)

Sets output column name of annotations.

setParamValue(paramName)

Sets the value of a parameter.

setParams()

setStorageRef(value)

Sets unique reference name for identification.

transform(dataset[, params])

Transforms the input dataset with optional parameters.

write()

Returns an MLWriter instance for this ML instance.

-

Attributes


batchSize

classes

configProtoBytes

getter_attrs

includeAllConfidenceScores

includeConfidence

inferenceBatchSize

inputCols

labelCasing

lazyAnnotator

name

outputCol

params

Returns all params ordered by name.

storageRef

trainingClassDistribution

-
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some -extra params. This implementation first calls Params.copy and -then make a copy of the companion Java pipeline component with -extra params. So both the Python wrapper and the Java pipeline -component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
-
-getBatchSize()
-

Gets current batch size.

-
-
Returns
-
-
int

Current batch size

-
-
-
-
-
- -
-
-getInputCols()
-

Gets current column names of input annotations.

-
- -
-
-getLazyAnnotator()
-

Gets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getOutputCol()
-

Gets output column name of annotations.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-getStorageRef()
-

Gets unique reference name for identification.

-
-
Returns
-
-
str

Unique reference name for identification

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-property params
-

Returns all params ordered by name. The default implementation -uses dir() to get all attributes of type -Param.

-
- -
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setBatchSize(v)
-

Sets batch size.

-
-
Parameters
-
-
vint

Batch size

-
-
-
-
-
- -
-
-setConfigProtoBytes(b)[source]
-

Sets configProto from tensorflow, serialized into byte array.

-
-
Parameters
-
-
bList[str]

ConfigProto from tensorflow, serialized into byte array

-
-
-
-
-
- -
-
-setIncludeConfidence(value)[source]
-

Sets whether to include confidence scores in annotation metadata, by -default False.

-
-
Parameters
-
-
valuebool

Whether to include the confidence value in the output.

-
-
-
-
-
- -
-
-setInferenceBatchSize(value)[source]
-

Sets number of sentences to process in a single batch during inference

-
-
Parameters
-
-
valueint

number of sentences to process in a single batch during inference

-
-
-
-
-
- -
-
-setInputCols(*value)
-

Sets column names of input annotations.

-
-
Parameters
-
-
*valuestr

Input columns for the annotator

-
-
-
-
-
- -
-
-setLabelCasing(value)[source]
-

Sets all labels of the NER model to upper or lower case. Possible values: upper, lower.

-
-
Parameters
-
-
valuestr

Whether labels should be upper or lower case. Possible values: upper, lower.

-
-
-
-
-
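A minimal sketch, assuming the pretrained "ner_jsl" clinical model used elsewhere in these docs:

>>> nerModel = MedicalNerModel.pretrained("ner_jsl", "en", "clinical/models") \
...     .setInputCols(["sentence", "token", "embeddings"]) \
...     .setOutputCol("ner") \
...     .setLabelCasing("upper")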
- -
-
-setLazyAnnotator(value)
-

Sets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
-
Parameters
-
-
valuebool

Whether Annotator should be evaluated lazily in a -RecursivePipeline

-
-
-
-
-
- -
-
-setOutputCol(value)
-

Sets output column name of annotations.

-
-
Parameters
-
-
valuestr

Name of output column

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-setStorageRef(value)
-

Sets unique reference name for identification.

-
-
Parameters
-
-
valuestr

Unique reference name for identification

-
-
-
-
-
- -
-
-transform(dataset, params=None)
-

Transforms the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset

-
-
paramsdict, optional

an optional param map that overrides embedded params.

-
-
-
-
Returns
-
-
pyspark.sql.DataFrame

transformed dataset

-
-
-
-
-
- -
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

\ No newline at end of file
diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.NerChunker.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.NerChunker.html
deleted file mode 100644
index 9c012212fb..0000000000
--- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.NerChunker.html
+++ /dev/null
@@ -1,1080 +0,0 @@
-sparknlp_jsl.annotator.NerChunker — Spark NLP 3.3.0 documentation

sparknlp_jsl.annotator.NerChunker

-
-
-class sparknlp_jsl.annotator.NerChunker(classname='com.johnsnowlabs.nlp.annotators.ner.NerChunker', java_model=None)[source]
-

Bases: sparknlp.common.AnnotatorModel

-
-
Extracts phrases that fit a known pattern from NER tags. Useful for entity groups with neighboring tokens when there is no pretrained NER model that addresses them directly. A regex needs to be provided to extract the tokens between entities.

-
-
Input Annotation types: DOCUMENT, CHUNK

Output Annotation type: NAMED_ENTITY

-
-
Parameters
-
-
setRegexParsers

A list of regex patterns to match chunks, for example: ["<DT>?<JJ>*<NN>"]

-
-
-
-
-

Examples

-
>>> import sparknlp
->>> from sparknlp.base import *
->>> from sparknlp_jsl.base import *
->>> from sparknlp.annotator import *
->>> from sparknlp_jsl.annotator import *
->>> from sparknlp.training import *
->>> from pyspark.ml import Pipeline
-
-
-
>>> document_assembler = DocumentAssembler() \
-...    .setInputCol("text") \
-...    .setOutputCol("document")
-...
->>> sentence_detector = SentenceDetector() \
-...    .setInputCols(["document"]) \
-...    .setOutputCol("sentence")
-...
->>> tokenizer = Tokenizer() \
-...    .setInputCols(["sentence"]) \
-...    .setOutputCol("token")
-...
->>> embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") \
-...    .setInputCols(["sentence", "token"]) \
-...    .setOutputCol("embeddings") \
-...    .setCaseSensitive(False)
-...
->>> ner = MedicalNerModel.pretrained("ner_radiology", "en", "clinical/models") \
-...    .setInputCols(["sentence", "token","embeddings"]) \
-...    .setOutputCol("ner")
-...
->>> chunker = NerChunker() \
-...    .setInputCols(["sentence","ner"]) \
-...    .setOutputCol("chunk") \
-...    .setRegexParsers(["<ImagingFindings>.*<BodyPart>"])
-...
-...
->>> pipeline = Pipeline(stages=[
-...    document_assembler,
-...    sentence_detector,
-...    tokenizer,
-...    embeddings,
-...    ner,
-...    chunker
-...])
->>> result = pipeline.fit(dataset).transform(dataset)
-
-
-

Methods


__init__([classname, java_model])

Initialize this instance with a Java model object.

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

getInputCols()

Gets current column names of input annotations.

getLazyAnnotator()

Gets whether Annotator should be evaluated lazily in a RecursivePipeline.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getOutputCol()

Gets output column name of annotations.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

set(param, value)

Sets a parameter in the embedded param map.

setInputCols(*value)

Sets column names of input annotations.

setLazyAnnotator(value)

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

setOutputCol(value)

Sets output column name of annotations.

setParamValue(paramName)

Sets the value of a parameter.

setParams()

setRegexParsers(b)

Sets the list of regex patterns to match chunks, for example: ["<DT>?<JJ>*<NN>"]

transform(dataset[, params])

Transforms the input dataset with optional parameters.

write()

Returns an MLWriter instance for this ML instance.

-

Attributes


getter_attrs

inputCols

lazyAnnotator

name

outputCol

params

Returns all params ordered by name.

regexParsers

-
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some -extra params. This implementation first calls Params.copy and -then make a copy of the companion Java pipeline component with -extra params. So both the Python wrapper and the Java pipeline -component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
-
-getInputCols()
-

Gets current column names of input annotations.

-
- -
-
-getLazyAnnotator()
-

Gets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getOutputCol()
-

Gets output column name of annotations.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-property params
-

Returns all params ordered by name. The default implementation -uses dir() to get all attributes of type -Param.

-
- -
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setInputCols(*value)
-

Sets column names of input annotations.

-
-
Parameters
-
-
*valuestr

Input columns for the annotator

-
-
-
-
-
- -
-
-setLazyAnnotator(value)
-

Sets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
-
Parameters
-
-
valuebool

Whether Annotator should be evaluated lazily in a -RecursivePipeline

-
-
-
-
-
- -
-
-setOutputCol(value)
-

Sets output column name of annotations.

-
-
Parameters
-
-
valuestr

Name of output column

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-setRegexParsers(b)[source]
-

Sets the list of regex patterns to match chunks, for example: ["<DT>?<JJ>*<NN>"]

-
-
Parameters
-
-
bList[String]

List of regex patterns to match chunks, for example: ["<DT>?<JJ>*<NN>"]

-
-
-
-
-
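A short usage sketch reusing the radiology pattern from the example above; the pattern matches any span starting at an ImagingFindings entity and ending at a BodyPart entity:

>>> chunker = NerChunker() \
...     .setInputCols(["sentence", "ner"]) \
...     .setOutputCol("chunk") \
...     .setRegexParsers(["<ImagingFindings>.*<BodyPart>"])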
- -
-
-transform(dataset, params=None)
-

Transforms the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset

-
-
paramsdict, optional

an optional param map that overrides embedded params.

-
-
-
-
Returns
-
-
pyspark.sql.DataFrame

transformed dataset

-
-
-
-
-
- -
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

\ No newline at end of file
diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.NerConverterInternal.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.NerConverterInternal.html
deleted file mode 100644
index c6ec1fd043..0000000000
--- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.NerConverterInternal.html
+++ /dev/null
@@ -1,1170 +0,0 @@
-sparknlp_jsl.annotator.NerConverterInternal — Spark NLP 3.3.0 documentation

sparknlp_jsl.annotator.NerConverterInternal

-
-
-class sparknlp_jsl.annotator.NerConverterInternal[source]
-

Bases: sparknlp.common.AnnotatorModel

-

Converts an IOB or IOB2 representation of NER to a user-friendly one by associating the tokens of recognized entities with their label. Chunks with no associated entity (tagged "O") are filtered out.

Input Annotation types: DOCUMENT, TOKEN, NAMED_ENTITY

Output Annotation type: CHUNK

-
-
Parameters
-
-
whiteList

If defined, list of entities to process. The rest will be ignored. Do not include IOB prefix on labels

-
-
blackList

If defined, list of entities to ignore. The rest will be processed. Do not include the IOB prefix on labels.

-
-
preservePosition

Whether to preserve the original position of the tokens in the original document or use the modified tokens

-
-
greedyMode

Whether to ignore B tags for contiguous tokens of the same entity.

-
-
threshold

Confidence threshold to filter the chunk entities.

-
-
-
-
-

Examples

-
>>> import sparknlp
->>> from sparknlp.base import *
->>> from sparknlp.common import *
->>> from sparknlp.annotator import *
->>> from sparknlp.training import *
->>> import sparknlp_jsl
->>> from sparknlp_jsl.base import *
->>> from sparknlp_jsl.annotator import *
->>> from pyspark.ml import Pipeline
->>> data = spark.createDataFrame([["A 63-year-old man presents to the hospital ..."]]).toDF("text")
->>> documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
->>> sentenceDetector = SentenceDetector().setInputCols(["document"]).setOutputCol("sentence")
->>> tokenizer = Tokenizer().setInputCols(["sentence"]).setOutputCol("token")
->>> embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models").setInputCols(["sentence", "token"]).setOutputCol("embs")
->>> nerModel = MedicalNerModel.pretrained("ner_jsl", "en", "clinical/models").setInputCols(["sentence", "token", "embs"]).setOutputCol("ner")
->>> nerConverter = NerConverterInternal().setInputCols(["sentence", "token", "ner"]).setOutputCol("ner_chunk")
-...
->>> pipeline = Pipeline(stages=[
-...     documentAssembler,
-...     sentenceDetector,
-...     tokenizer,
-...     embeddings,
-...     nerModel,
-...     nerConverter])
-
-
-

Methods


__init__()

Initialize this instance with a Java model object.

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

getInputCols()

Gets current column names of input annotations.

getLazyAnnotator()

Gets whether Annotator should be evaluated lazily in a RecursivePipeline.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getOutputCol()

Gets output column name of annotations.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

set(param, value)

Sets a parameter in the embedded param map.

setBlackList(entities)

If defined, list of entities to ignore.

setGreedyMode(p)

Sets whether to ignore B tags for contiguous tokens of the same entity.

setInputCols(*value)

Sets column names of input annotations.

setLazyAnnotator(value)

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

setOutputCol(value)

Sets output column name of annotations.

setParamValue(paramName)

Sets the value of a parameter.

setParams()

setPreservePosition(p)

Sets whether to preserve the original position of the tokens in the original document or use the modified tokens

setReplaceDictResource(path[, read_as, options])

Sets replace dictionary pairs

setThreshold(p)

Sets confidence threshold to filter the chunk entities.

setWhiteList(entities)

If defined, list of entities to process.

transform(dataset[, params])

Transforms the input dataset with optional parameters.

write()

Returns an MLWriter instance for this ML instance.

-

Attributes


blackList

getter_attrs

greedyMode

inputCols

lazyAnnotator

name

outputCol

params

Returns all params ordered by name.

preservePosition

replaceDictResource

threshold

whiteList

-
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some -extra params. This implementation first calls Params.copy and -then make a copy of the companion Java pipeline component with -extra params. So both the Python wrapper and the Java pipeline -component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
-
-getInputCols()
-

Gets current column names of input annotations.

-
- -
-
-getLazyAnnotator()
-

Gets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getOutputCol()
-

Gets output column name of annotations.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-property params
-

Returns all params ordered by name. The default implementation -uses dir() to get all attributes of type -Param.

-
- -
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setBlackList(entities)[source]
-

If defined, list of entities to ignore. The rest will be processed. Do not include IOB prefix on labels

-
-
Parameters
-
-
entitieslist

If defined, list of entities to ignore. The rest will be processed. Do not include IOB prefix on labels

-
-
-
-
-
- -
-
-setGreedyMode(p)[source]
-

Sets whether to ignore B tags for contiguous tokens of the same entity.

-
-
Parameters
-
-
pbool

Whether to ignore B tags for contiguous tokens of the same entity.

-
-
-
-
-
- -
-
-setInputCols(*value)
-

Sets column names of input annotations.

-
-
Parameters
-
-
*valuestr

Input columns for the annotator

-
-
-
-
-
- -
-
-setLazyAnnotator(value)
-

Sets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
-
Parameters
-
-
valuebool

Whether Annotator should be evaluated lazily in a -RecursivePipeline

-
-
-
-
-
- -
-
-setOutputCol(value)
-

Sets output column name of annotations.

-
-
Parameters
-
-
valuestr

Name of output column

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-setPreservePosition(p)[source]
-

Sets whether to preserve the original position of the tokens in the original document or use the modified tokens

-
-
Parameters
-
-
pbool

Whether to preserve the original position of the tokens in the original document or use the modified tokens

-
-
-
-
-
- -
-
-setReplaceDictResource(path, read_as='TEXT', options={'delimiter': ','})[source]
-

Sets replace dictionary pairs

-
-
Parameters
-
-
pathstr

Path to the external resource

-
-
read_asstr, optional

How to read the resource, by default ReadAs.TEXT

-
-
optionsdict, optional

Options for reading the resource, by default {“format”: “text”}

-
-
-
-
-
- -
-
-setThreshold(p)[source]
-

Sets confidence threshold to filter the chunk entities.

-
-
Parameters
-
-
pfloat

Confidence threshold to filter the chunk entities.

-
-
-
-
-
- -
-
-setWhiteList(entities)[source]
-

If defined, list of entities to process. The rest will be ignored. Do not include IOB prefix on labels

-
-
Parameters
-
-
entitieslist

If defined, list of entities to process. The rest will be ignored. Do not include IOB prefix on labels

-
-
-
-
-
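A minimal sketch of keeping only selected labels; the entity names are hypothetical and depend on the upstream NER model:

>>> nerConverter = NerConverterInternal() \
...     .setInputCols(["sentence", "token", "ner"]) \
...     .setOutputCol("ner_chunk") \
...     .setWhiteList(["Drug", "Dosage"])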
- -
-
-transform(dataset, params=None)
-

Transforms the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset

-
-
paramsdict, optional

an optional param map that overrides embedded params.

-
-
-
-
Returns
-
-
pyspark.sql.DataFrame

transformed dataset

-
-
-
-
-
- -
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

\ No newline at end of file
diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.NerDisambiguator.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.NerDisambiguator.html
deleted file mode 100644
index 50a0731017..0000000000
--- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.NerDisambiguator.html
+++ /dev/null
@@ -1,1271 +0,0 @@
-sparknlp_jsl.annotator.NerDisambiguator — Spark NLP 3.3.0 documentation

sparknlp_jsl.annotator.NerDisambiguator

-
-
-class sparknlp_jsl.annotator.NerDisambiguator[source]
-

Bases: sparknlp.common.AnnotatorApproach

-

Links words of interest, such as names of persons, locations and companies, from an input text document to a corresponding unique entity in a target Knowledge Base (KB). Words of interest are called Named Entities (NEs), mentions, or surface forms.

Input Annotation types: CHUNK, SENTENCE_EMBEDDINGS

Output Annotation type: DISAMBIGUATION

-
-
Parameters
-
-
embeddingTypeParam

Could be ‘bow’ for word embeddings or ‘sentence’ for sentences

-
-
numFirstChars

How many characters should be considered for initial prefix search in knowledge base

-
-
tokenSearch

Should we search by token or by chunk in knowledge base (token is recommended)

-
-
narrowWithApproximateMatching

Whether to narrow prefix search results with Levenshtein-distance-based matching (true is recommended)

-
-
levenshteinDistanceThresholdParam

Levenshtein distance threshold to narrow results from prefix search (0.1 is default)

-
-
nearMatchingGapParam

Puts a limit on string length (by trimming the candidate chunks) during Levenshtein-distance-based narrowing: len(candidate) - len(entity chunk) > nearMatchingGap (default: 4).

-
-
predictionsLimit

Limit on amount of predictions N for topN predictions

-
-
s3KnowledgeBaseName

knowledge base name in s3

-
-
-
-
-

Examples

-
>>> data = spark.createDataFrame([["The show also had a contestant named Donald Trump who later defeated Christina Aguilera ..."]]) \
...     .toDF("text")
->>> documentAssembler = DocumentAssembler() \
-...   .setInputCol("text") \
-...   .setOutputCol("document")
->>> sentenceDetector = SentenceDetector() \
-...   .setInputCols(["document"]) \
-...   .setOutputCol("sentence")
->>> tokenizer = Tokenizer() \
-...   .setInputCols(["sentence"]) \
-...   .setOutputCol("token")
->>> word_embeddings = WordEmbeddingsModel.pretrained() \
-...   .setInputCols(["sentence", "token"]) \
-...   .setOutputCol("embeddings")
->>> sentence_embeddings = SentenceEmbeddings() \
-...   .setInputCols(["sentence","embeddings"]) \
-...   .setOutputCol("sentence_embeddings")
->>> ner_model = NerDLModel.pretrained() \
-...   .setInputCols(["sentence", "token", "embeddings"]) \
-...   .setOutputCol("ner")
->>> ner_converter = NerConverter() \
-...   .setInputCols(["sentence", "token", "ner"]) \
-...   .setOutputCol("ner_chunk") \
-...   .setWhiteList(["PER"])
-
-
-

Then the extracted entities can be disambiguated.

>>> disambiguator = NerDisambiguator() \
...     .setS3KnowledgeBaseName("i-per") \
...     .setInputCols(["ner_chunk", "sentence_embeddings"]) \
...     .setOutputCol("disambiguation") \
...     .setNumFirstChars(5)
>>> nlpPipeline = Pipeline(stages=[
...     documentAssembler,
...     sentenceDetector,
...     tokenizer,
...     word_embeddings,
...     sentence_embeddings,
...     ner_model,
...     ner_converter,
...     disambiguator])
>>> model = nlpPipeline.fit(data)
>>> result = model.transform(data)
>>> result.selectExpr("explode(disambiguation)") \
...     .selectExpr("col.metadata.chunk as chunk", "col.result as result").show(5, False)

chunk               | result
Donald Trump        | http:#en.wikipedia.org/?curid=4848272, http:#en.wikipedia.org/?curid=31698421, http:#en.wikipedia.org/?curid=55907961
Christina Aguilera  | http:#en.wikipedia.org/?curid=144171, http:#en.wikipedia.org/?curid=6636454

-

Methods


__init__()

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

fit(dataset[, params])

Fits a model to the input dataset with optional parameters.

fitMultiple(dataset, paramMaps)

Fits a model to the input dataset for each param map in paramMaps.

getInputCols()

Gets current column names of input annotations.

getLazyAnnotator()

Gets whether Annotator should be evaluated lazily in a RecursivePipeline.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getOutputCol()

Gets output column name of annotations.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

set(param, value)

Sets a parameter in the embedded param map.

setEmbeddingType(value)

Sets whether to use 'bow' for word embeddings or 'sentence' for sentence embeddings.

setInputCols(*value)

Sets column names of input annotations.

setLazyAnnotator(value)

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

setLevenshteinDistanceThresholdParam(value)

Sets Levenshtein distance threshold to narrow results from prefix search (0.1 is default)

setNarrowWithApproximateMatching(value)

Sets whether to narrow prefix search results with Levenshtein-distance-based matching (default: true)

setNearMatchingGapParam(value)

Sets a limit on a string length (by trimming the candidate chunks) during levenshtein-distance based narrowing.

setNumFirstChars(value)

How many characters should be considered for initial prefix search in knowledge base

setOutputCol(value)

Sets output column name of annotations.

setParamValue(paramName)

Sets the value of a parameter.

setPredictionLimit(value)

Sets limit on amount of predictions N for topN predictions

setS3KnowledgeBaseName(value)

Sets knowledge base name in s3

setTokenSearch(value)

Sets whether to search by token or by chunk in knowledge base (Default: true)

write()

Returns an MLWriter instance for this ML instance.

-

Attributes


embeddingTypeParam

getter_attrs

inputCols

lazyAnnotator

levenshteinDistanceThresholdParam

narrowWithApproximateMatching

nearMatchingGapParam

numFirstChars

outputCol

params

Returns all params ordered by name.

predictionsLimit

s3KnowledgeBaseName

tokenSearch

-
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some extra params. This implementation first calls Params.copy and then makes a copy of the companion Java pipeline component with extra params, so both the Python wrapper and the Java pipeline component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
-
-fit(dataset, params=None)
-

Fits a model to the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset.

-
-
paramsdict or list or tuple, optional

an optional param map that overrides embedded params. If a list/tuple of -param maps is given, this calls fit on each param map and returns a list of -models.

-
-
-
-
Returns
-
-
Transformer or a list of Transformer

fitted model(s)

-
-
-
-
-
- -
-
-fitMultiple(dataset, paramMaps)
-

Fits a model to the input dataset for each param map in paramMaps.

-
-

New in version 2.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset.

-
-
paramMapscollections.abc.Sequence

A Sequence of param maps.

-
-
-
-
Returns
-
-
_FitMultipleIterator

A thread safe iterable which contains one model for each param map. Each -call to next(modelIterator) will return (index, model) where model was fit -using paramMaps[index]. index values may not be sequential.

-
-
-
-
-
- -
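To make the fitMultiple() contract above concrete, here is a minimal, generic PySpark sketch. The estimator instance, the training DataFrame and the parameter being varied (numFirstChars, listed under Attributes below) are placeholders chosen for illustration, not values this page prescribes.

# Hedged sketch: `disambiguator` is any configured estimator from this page and
# `train_df` an existing training DataFrame; both are assumed to exist already.
param_maps = [
    {disambiguator.numFirstChars: 3},
    {disambiguator.numFirstChars: 5},
]
# fitMultiple returns a thread-safe iterator of (index, model) pairs; the index
# points back into param_maps and is not guaranteed to arrive in order.
for index, model in disambiguator.fitMultiple(train_df, param_maps):
    print(index, model.uid)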
-
-getInputCols()
-

Gets current column names of input annotations.

-
- -
-
-getLazyAnnotator()
-

Gets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getOutputCol()
-

Gets output column name of annotations.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-property params
-

Returns all params ordered by name. The default implementation -uses dir() to get all attributes of type -Param.

-
- -
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setEmbeddingType(value)[source]
-

Sets whether to use 'bow' for word embeddings or 'sentence' for sentence embeddings.

-
-
Parameters
-
-
valuestr

Can be 'bow' for word embeddings or 'sentence' for sentences (Default: sentence)

-
-
-
-
-
- -
-
-setInputCols(*value)
-

Sets column names of input annotations.

-
-
Parameters
-
-
*valuestr

Input columns for the annotator

-
-
-
-
-
- -
-
-setLazyAnnotator(value)
-

Sets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
-
Parameters
-
-
valuebool

Whether Annotator should be evaluated lazily in a -RecursivePipeline

-
-
-
-
-
- -
-
-setLevenshteinDistanceThresholdParam(value)[source]
-

Sets Levenshtein distance threshold to narrow results from prefix search (0.1 is default)

-
-
Parameters
-
-
valuefloat

Levenshtein distance threshold to narrow results from prefix search (0.1 is default)

-
-
-
-
-
- -
-
-setNarrowWithApproximateMatching(value)[source]
-

Sets whether to narrow prefix search results with Levenshtein distance-based matching (Default: true)

-
-
Parameters
-
-
valuebool

Whether to narrow prefix search results with Levenshtein distance-based matching (Default: true)

-
-
-
-
-
- -
-
-setNearMatchingGapParam(value)[source]
-

Sets a limit on a string length (by trimming the candidate chunks) during levenshtein-distance based narrowing.

-
-
Parameters
-
-
valueint

Limit on a string length (by trimming the candidate chunks) during levenshtein-distance based narrowing

-
-
-
-
-
- -
-
-setNumFirstChars(value)[source]
-

Sets how many characters should be considered for the initial prefix search in the knowledge base

-
-
Parameters
-
-
valueint

Number of characters to consider for the initial prefix search in the knowledge base

-
-
-
-
-
- -
-
-setOutputCol(value)
-

Sets output column name of annotations.

-
-
Parameters
-
-
valuestr

Name of output column

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-setPredictionLimit(value)[source]
-

Sets limit on amount of predictions N for topN predictions

-
-
Parameters
-
-
valueint

Limit on amount of predictions N for topN predictions

-
-
-
-
-
- -
-
-setS3KnowledgeBaseName(value)[source]
-

Sets knowledge base name in s3

-
-
Parameters
-
-
valuestr

Knowledge base name in S3, e.g. 'i-per'

-
-
-
-
-
- -
-
-setTokenSearch(value)[source]
-

Sets whether to search by token or by chunk in knowledge base (Default: true)

-
-
Parameters
-
-
valuebool

Whether to search by token or by chunk in knowledge base (Default: true)

-
-
-
-
-
- -
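Taken together, the setters above are typically chained on a single instance. The following is an illustrative sketch only, assuming this page documents NerDisambiguator (the trainable counterpart of NerDisambiguatorModel below); the column names and the S3 knowledge base name ('i-per', borrowed from the parameter note above) are placeholders, not shipped defaults.

# Minimal configuration sketch using only the setters documented on this page.
disambiguator = NerDisambiguator() \
    .setInputCols(["ner_chunk", "sentence_embeddings"]) \
    .setOutputCol("disambiguation") \
    .setS3KnowledgeBaseName("i-per") \
    .setNumFirstChars(5) \
    .setTokenSearch(True) \
    .setNarrowWithApproximateMatching(True) \
    .setLevenshteinDistanceThresholdParam(0.1) \
    .setPredictionLimit(10)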
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

-
\ No newline at end of file
diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.NerDisambiguatorModel.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.NerDisambiguatorModel.html
deleted file mode 100644
index 69a4b6fb6f..0000000000
--- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.NerDisambiguatorModel.html
+++ /dev/null
@@ -1,1227 +0,0 @@

sparknlp_jsl.annotator.NerDisambiguatorModel — Spark NLP 3.3.0 documentation

sparknlp_jsl.annotator.NerDisambiguatorModel

-
-
-class sparknlp_jsl.annotator.NerDisambiguatorModel(classname='com.johnsnowlabs.nlp.annotators.disambiguation.NerDisambiguatorModel', java_model=None)[source]
-

Bases: sparknlp.common.AnnotatorModel

-

Instantiated / pretrained model of the NerDisambiguator. Links words of interest, such as names of persons, locations and companies, from an input text document to a corresponding unique entity in a target Knowledge Base (KB). Words of interest are called Named Entities (NEs), mentions, or surface forms.


Input Annotation types

Output Annotation type

CHUNK, SENTENCE_EMBEDDINGS

DISAMBIGUATION

-
-
Parameters
-
-
embeddingTypeParam

Could be bow for word embeddings or sentence for sentences

-
-
numFirstChars

How many characters should be considered for initial prefix search in knowledge base

-
-
tokenSearch

Should we search by token or by chunk in knowledge base (token is recommended)

-
-
narrowWithApproximateMatching

Should we narrow prefix search results with Levenshtein distance-based matching (true is recommended)

-
-
levenshteinDistanceThresholdParam

Levenshtein distance threshold to narrow results from prefix search (0.1 is default)

-
-
nearMatchingGapParam

Puts a limit on the string length (by trimming the candidate chunks) during Levenshtein-distance based narrowing: len(candidate) - len(entity chunk) > nearMatchingGap (Default: 4).

-
-
predictionsLimit

Limit on amount of predictions N for topN predictions

-
-
s3KnowledgeBaseName

knowledge base name in s3

-
-
-
-
-

Examples

-
>>> data = spark.createDataFrame([["The show also had a contestant named Donald Trump who later defeated Christina Aguilera ..."]]) \
-...   .toDF("text")
->>> documentAssembler = DocumentAssembler() \
-...   .setInputCol("text") \
-...   .setOutputCol("document")
->>> sentenceDetector = SentenceDetector() \
-...   .setInputCols(["document"]) \
-...   .setOutputCol("sentence")
->>> tokenizer = Tokenizer() \
-...   .setInputCols(["sentence"]) \
-...   .setOutputCol("token")
->>> word_embeddings = WordEmbeddingsModel.pretrained() \
-...   .setInputCols(["sentence", "token"]) \
-...   .setOutputCol("embeddings")
->>> sentence_embeddings = SentenceEmbeddings() \
-...   .setInputCols(["sentence","embeddings"]) \
-...   .setOutputCol("sentence_embeddings")    >>> ner_model = NerDLModel.pretrained() \
-...   .setInputCols(["sentence", "token", "embeddings"]) \
-...   .setOutputCol("ner")
->>> ner_converter = NerConverter() \
-...   .setInputCols(["sentence", "token", "ner"]) \
-...   .setOutputCol("ner_chunk") \
-...   .setWhiteList(["PER"])
-
-
-

Then the extracted entities can be disambiguated.
->>> disambiguator = NerDisambiguatorModel.pretrained() \
-...   .setInputCols(["ner_chunk", "sentence_embeddings"]) \
-...   .setOutputCol("disambiguation") \
-...   .setNumFirstChars(5)
-...
->>> nlpPipeline = Pipeline(stages=[
-...   documentAssembler,
-...   sentenceDetector,
-...   tokenizer,
-...   word_embeddings,
-...   sentence_embeddings,
-...   ner_model,
-...   ner_converter,
-...   disambiguator])
-...
->>> model = nlpPipeline.fit(data)
->>> result = model.transform(data)
->>> result.selectExpr("explode(disambiguation)") \
-...   .selectExpr("col.metadata.chunk as chunk", "col.result as result").show(5, False)


chunk              | result
Donald Trump       | http://en.wikipedia.org/?curid=4848272, http://en.wikipedia.org/?curid=31698421, http://en.wikipedia.org/?curid=55907961
Christina Aguilera | http://en.wikipedia.org/?curid=144171, http://en.wikipedia.org/?curid=6636454

-

Methods


__init__([classname, java_model])

Initialize this instance with a Java model object.

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

getInputCols()

Gets current column names of input annotations.

getLazyAnnotator()

Gets whether Annotator should be evaluated lazily in a RecursivePipeline.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getOutputCol()

Gets output column name of annotations.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

pretrained([name, lang, remote_loc])

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

set(param, value)

Sets a parameter in the embedded param map.

setEmbeddingType(value)

Sets if we want to use 'bow' for word embeddings or 'sentence' for sentences

setInputCols(*value)

Sets column names of input annotations.

setLazyAnnotator(value)

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

setLevenshteinDistanceThresholdParam(value)

Sets Levenshtein distance threshold to narrow results from prefix search (0.1 is default)

setNarrowWithApproximateMatching(value)

Sets whether to narrow prefix search results with Levenshtein distance-based matching (Default: true)

setNearMatchingGapParam(value)

Sets a limit on a string length (by trimming the candidate chunks) during levenshtein-distance based narrowing.

setNumFirstChars(value)

Sets how many characters should be considered for the initial prefix search in the knowledge base

setOutputCol(value)

Sets output column name of annotations.

setParamValue(paramName)

Sets the value of a parameter.

setParams()

setPredictionLimit(value)

Sets limit on amount of predictions N for topN predictions

setTokenSearch(value)

Sets whether to search by token or by chunk in knowledge base (Default: true)

transform(dataset[, params])

Transforms the input dataset with optional parameters.

write()

Returns an MLWriter instance for this ML instance.

-

Attributes


embeddingTypeParam

getter_attrs

inputCols

lazyAnnotator

levenshteinDistanceThresholdParam

name

narrowWithApproximateMatching

nearMatchingGapParam

numFirstChars

outputCol

params

Returns all params ordered by name.

predictionsLimit

tokenSearch

-
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some extra params. This implementation first calls Params.copy and then makes a copy of the companion Java pipeline component with extra params, so both the Python wrapper and the Java pipeline component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
-
-getInputCols()
-

Gets current column names of input annotations.

-
- -
-
-getLazyAnnotator()
-

Gets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getOutputCol()
-

Gets output column name of annotations.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-property params
-

Returns all params ordered by name. The default implementation -uses dir() to get all attributes of type -Param.

-
- -
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setEmbeddingType(value)[source]
-

Sets if we want to use ‘bow’ for word embeddings or ‘sentence’ for sentences

-
-
Parameters
-
-
valuestr

Can be ‘bow’ for word embeddings or ‘sentence’ for sentences (Default: sentence)

-
-
-
-
-
- -
-
-setInputCols(*value)
-

Sets column names of input annotations.

-
-
Parameters
-
-
*valuestr

Input columns for the annotator

-
-
-
-
-
- -
-
-setLazyAnnotator(value)
-

Sets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
-
Parameters
-
-
valuebool

Whether Annotator should be evaluated lazily in a -RecursivePipeline

-
-
-
-
-
- -
-
-setLevenshteinDistanceThresholdParam(value)[source]
-

Sets Levenshtein distance threshold to narrow results from prefix search (0.1 is default)

-
-
Parameters
-
-
valuefloat

Levenshtein distance threshold to narrow results from prefix search (0.1 is default)

-
-
-
-
-
- -
-
-setNarrowWithApproximateMatching(value)[source]
-

Sets whether to narrow prefix search results with Levenshtein distance-based matching (Default: true)

-
-
Parameters
-
-
valuebool

Whether to narrow prefix search results with Levenshtein distance-based matching (Default: true)

-
-
-
-
-
- -
-
-setNearMatchingGapParam(value)[source]
-

Sets a limit on a string length (by trimming the candidate chunks) during levenshtein-distance based narrowing.

-
-
Parameters
-
-
valueint

Limit on a string length (by trimming the candidate chunks) during levenshtein-distance based narrowing

-
-
-
-
-
- -
-
-setNumFirstChars(value)[source]
-

Sets how many characters should be considered for the initial prefix search in the knowledge base

-
-
Parameters
-
-
valueint

Number of characters to consider for the initial prefix search in the knowledge base

-
-
-
-
-
- -
-
-setOutputCol(value)
-

Sets output column name of annotations.

-
-
Parameters
-
-
valuestr

Name of output column

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-setPredictionLimit(value)[source]
-

Sets limit on amount of predictions N for topN predictions

-
-
Parameters
-
-
valueint

Limit on amount of predictions N for topN predictions

-
-
-
-
-
- -
-
-setTokenSearch(value)[source]
-

Sets whether to search by token or by chunk in knowledge base (Default: true)

-
-
Parameters
-
-
valuebool

Whether to search by token or by chunk in knowledge base (Default: true)

-
-
-
-
-
- -
-
-transform(dataset, params=None)
-

Transforms the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset

-
-
paramsdict, optional

an optional param map that overrides embedded params.

-
-
-
-
Returns
-
-
pyspark.sql.DataFrame

transformed dataset

-
-
-
-
-
- -
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

-
\ No newline at end of file
diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.PosologyREModel.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.PosologyREModel.html
deleted file mode 100644
index 9e6f781e45..0000000000
--- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.PosologyREModel.html
+++ /dev/null
@@ -1,1078 +0,0 @@

sparknlp_jsl.annotator.PosologyREModel — Spark NLP 3.3.0 documentation

sparknlp_jsl.annotator.PosologyREModel

-
-
-class sparknlp_jsl.annotator.PosologyREModel(classname='com.johnsnowlabs.nlp.annotators.re.PosologyREModel', java_model=None)[source]
-

Bases: sparknlp_jsl.annotator.RelationExtractionModel

-
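Since this page carries no prose description, here is a hedged usage sketch only. PosologyREModel inherits from RelationExtractionModel, so it is assumed to consume WORD_EMBEDDINGS, POS, CHUNK and DEPENDENCY annotations; the pretrained model name "posology_re" and the column names are assumptions for illustration, not guaranteed by this page.

# Hypothetical wiring; upstream stages producing embeddings, pos_tags, ner_chunks
# and dependencies are assumed to exist (see the RelationExtractionApproach
# example further below for a full pipeline).
posology_re = PosologyREModel.pretrained("posology_re", "en", "clinical/models") \
    .setInputCols(["embeddings", "pos_tags", "ner_chunks", "dependencies"]) \
    .setOutputCol("relations") \
    .setMaxSyntacticDistance(4) \
    .setPredictionThreshold(0.5)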

Methods


__init__([classname, java_model])

Initialize this instance with a Java model object.

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

getClasses()

Returns labels used to train this model

getInputCols()

Gets current column names of input annotations.

getLazyAnnotator()

Gets whether Annotator should be evaluated lazily in a RecursivePipeline.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getOutputCol()

Gets output column name of annotations.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

pretrained(name[, lang, remote_loc])

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

set(param, value)

Sets a parameter in the embedded param map.

setCustomLabels(labels)

Sets custom relation labels

setInputCols(*value)

Sets column names of input annotations.

setLazyAnnotator(value)

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

setMaxSyntacticDistance(distance)

Sets maximal syntactic distance, as threshold (Default: 0)

setOutputCol(value)

Sets output column name of annotations.

setParamValue(paramName)

Sets the value of a parameter.

setParams()

setPredictionThreshold(threshold)

Sets Minimal activation of the target unit to encode a new relation instance

setRelationPairs(pairs)

Sets list of dash-separated pairs of named entities ("ENTITY1-ENTITY2", e.g. "Biomarker-RelativeDay") which will be processed

transform(dataset[, params])

Transforms the input dataset with optional parameters.

write()

Returns an MLWriter instance for this ML instance.

-

Attributes


classes

customLabels

getter_attrs

inputCols

lazyAnnotator

maxSyntacticDistance

name

outputCol

params

Returns all params ordered by name.

predictionThreshold

relationPairs

-
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some extra params. This implementation first calls Params.copy and then makes a copy of the companion Java pipeline component with extra params, so both the Python wrapper and the Java pipeline component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
-
-getClasses()
-

Returns labels used to train this model

-
- -
-
-getInputCols()
-

Gets current column names of input annotations.

-
- -
-
-getLazyAnnotator()
-

Gets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getOutputCol()
-

Gets output column name of annotations.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-property params
-

Returns all params ordered by name. The default implementation -uses dir() to get all attributes of type -Param.

-
- -
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setCustomLabels(labels)
-

Sets custom relation labels

-
-
Parameters
-
-
labelsdict[str, str]

Dictionary which maps old to new labels

-
-
-
-
-
- -
-
-setInputCols(*value)
-

Sets column names of input annotations.

-
-
Parameters
-
-
*valuestr

Input columns for the annotator

-
-
-
-
-
- -
-
-setLazyAnnotator(value)
-

Sets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
-
Parameters
-
-
valuebool

Whether Annotator should be evaluated lazily in a -RecursivePipeline

-
-
-
-
-
- -
-
-setMaxSyntacticDistance(distance)
-

Sets maximal syntactic distance, as threshold (Default: 0)

-
-
Parameters
-
-
distanceint

Maximal syntactic distance, as threshold (Default: 0)

-
-
-
-
-
- -
-
-setOutputCol(value)
-

Sets output column name of annotations.

-
-
Parameters
-
-
valuestr

Name of output column

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-setPredictionThreshold(threshold)
-

Sets Minimal activation of the target unit to encode a new relation instance

-
-
Parameters
-
-
thresholdfloat

Minimal activation of the target unit to encode a new relation instance

-
-
-
-
-
- -
-
-setRelationPairs(pairs)
-

Sets List of dash-separated pairs of named entities (“ENTITY1-ENTITY2”, e.g. “Biomarker-RelativeDay”), which will be processed

-
-
Parameters
-
-
pairsstr

List of dash-separated pairs of named entities (“ENTITY1-ENTITY2”, e.g. “Biomarker-RelativeDay”), which will be processed

-
-
-
-
-
- -
-
-transform(dataset, params=None)
-

Transforms the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset

-
-
paramsdict, optional

an optional param map that overrides embedded params.

-
-
-
-
Returns
-
-
pyspark.sql.DataFrame

transformed dataset

-
-
-
-
-
- -
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

-
\ No newline at end of file
diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.RENerChunksFilter.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.RENerChunksFilter.html
deleted file mode 100644
index 0dbe84de12..0000000000
--- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.RENerChunksFilter.html
+++ /dev/null
@@ -1,1145 +0,0 @@

sparknlp_jsl.annotator.RENerChunksFilter — Spark NLP 3.3.0 documentation

sparknlp_jsl.annotator.RENerChunksFilter

-
-
-class sparknlp_jsl.annotator.RENerChunksFilter(classname='com.johnsnowlabs.nlp.annotators.re.RENerChunksFilter', java_model=None)[source]
-

Bases: sparknlp.common.AnnotatorModel

-

Filters and outputs combinations of relations between extracted entities, for further processing. This annotator is especially useful to create inputs for the RelationExtractionDLModel.


Input Annotation types

Output Annotation type

CHUNK,DEPENDENCY

CHUNK

-
-
Parameters
-
-
relationPairs

List of valid relations to encode

-
-
maxSyntacticDistance

Maximum syntactic distance between a pair of named entities to consider them as a relation

-
-
docLevelRelations

Include relations between entities from different sentences (Default: False)

-
-
-
-
-

Examples

-
>>> documenter = DocumentAssembler()\
-...   .setInputCol("text")\
-...   .setOutputCol("document")
-...
->>> sentencer = SentenceDetector()\
-...   .setInputCols(["document"])\
-...   .setOutputCol("sentences")
-...
->>> tokenizer = Tokenizer()\
-...   .setInputCols(["sentences"])\
-...   .setOutputCol("tokens")
-...
->>> words_embedder = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\
-...   .setInputCols(["sentences", "tokens"])\
-...   .setOutputCol("embeddings")
-...
->>> pos_tagger = PerceptronModel.pretrained("pos_clinical", "en", "clinical/models")\
-...   .setInputCols(["sentences", "tokens"])\
-...   .setOutputCol("pos_tags")
-...
->>> dependency_parser = DependencyParserModel.pretrained("dependency_conllu", "en")\
-...   .setInputCols(["sentences", "pos_tags", "tokens"])\
-...   .setOutputCol("dependencies")
-...
->>> clinical_ner_tagger = MedicalNerModel.pretrained("jsl_ner_wip_greedy_clinical","en","clinical/models")\
-...   .setInputCols(["sentences", "tokens", "embeddings"])\
-...   .setOutputCol("ner_tags")
-...
->>> ner_chunker = NerConverter()\
-...   .setInputCols(["sentences", "tokens", "ner_tags"])\
-...   .setOutputCol("ner_chunks")
-...
-... # Define the relation pairs and the filter
->>> relationPairs = [
-...   "direction-external_body_part_or_region",
-...   "external_body_part_or_region-direction",
-...   "direction-internal_organ_or_component",
-...   "internal_organ_or_component-direction"
-... ]
-...
->>> re_ner_chunk_filter = RENerChunksFilter()\
-...   .setInputCols(["ner_chunks", "dependencies"])\
-...   .setOutputCol("re_ner_chunks")\
-...   .setMaxSyntacticDistance(4)\
-...   .setRelationPairs(["internal_organ_or_component-direction"])
-...
->>> trained_pipeline = Pipeline(stages=[
-...   documenter,
-...   sentencer,
-...   tokenizer,
-...   words_embedder,
-...   pos_tagger,
-...   clinical_ner_tagger,
-...   ner_chunker,
-...   dependency_parser,
-...   re_ner_chunk_filter
-... ])
-...
->>> data = spark.createDataFrame([["MRI demonstrated infarction in the upper brain stem , left cerebellum and  right basil ganglia"]]).toDF("text")
->>> result = trained_pipeline.fit(data).transform(data)
-...
-... # Show results
->>> result.selectExpr("explode(re_ner_chunks) as re_chunks") \
-...   .selectExpr("re_chunks.begin", "re_chunks.result", "re_chunks.metadata.entity", "re_chunks.metadata.paired_to") \
-...   .show(6, truncate=False)
-+-----+-------------+---------------------------+---------+
-|begin|result       |entity                     |paired_to|
-+-----+-------------+---------------------------+---------+
-|35   |upper        |Direction                  |41       |
-|41   |brain stem   |Internal_organ_or_component|35       |
-|35   |upper        |Direction                  |59       |
-|59   |cerebellum   |Internal_organ_or_component|35       |
-|35   |upper        |Direction                  |81       |
-|81   |basil ganglia|Internal_organ_or_component|35       |
-+-----+-------------+---------------------------+---------+
-
-
-
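As the description notes, the filtered chunk pairs are typically consumed by a RelationExtractionDLModel. The sketch below is illustrative only; the pretrained model name is an assumption borrowed from common Spark NLP for Healthcare examples, and `result` refers to the DataFrame produced in the example above.

# Hedged downstream step: feed the filtered pairs into a relation extraction DL model.
re_dl = RelationExtractionDLModel.pretrained("redl_bodypart_direction_biobert", "en", "clinical/models") \
    .setInputCols(["re_ner_chunks", "sentences"]) \
    .setOutputCol("relations")
relations = re_dl.transform(result)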

Methods


__init__([classname, java_model])

Initialize this instance with a Java model object.

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

getInputCols()

Gets current column names of input annotations.

getLazyAnnotator()

Gets whether Annotator should be evaluated lazily in a RecursivePipeline.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getOutputCol()

Gets output column name of annotations.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

set(param, value)

Sets a parameter in the embedded param map.

setDocLevelRelations(docLevelRelations)

Sets whether to include relations between entities from different sentences

setInputCols(*value)

Sets column names of input annotations.

setLazyAnnotator(value)

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

setMaxSyntacticDistance(distance)

Sets maximum syntactic distance between a pair of named entities to consider them as a relation

setOutputCol(value)

Sets output column name of annotations.

setParamValue(paramName)

Sets the value of a parameter.

setParams()

setRelationPairs(pairs)

Sets list of dash-separated pairs of named entities

transform(dataset[, params])

Transforms the input dataset with optional parameters.

write()

Returns an MLWriter instance for this ML instance.

-

Attributes


docLevelRelations

getter_attrs

inputCols

lazyAnnotator

maxSyntacticDistance

name

outputCol

params

Returns all params ordered by name.

relationPairs

-
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some extra params. This implementation first calls Params.copy and then makes a copy of the companion Java pipeline component with extra params, so both the Python wrapper and the Java pipeline component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
-
-getInputCols()
-

Gets current column names of input annotations.

-
- -
-
-getLazyAnnotator()
-

Gets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getOutputCol()
-

Gets output column name of annotations.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-property params
-

Returns all params ordered by name. The default implementation -uses dir() to get all attributes of type -Param.

-
- -
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setDocLevelRelations(docLevelRelations)[source]
-

Sets whether to include relations between entities from different sentences

-
-
Parameters
-
-
docLevelRelationsbool

Whether to include relations between entities from different sentences

-
-
-
-
-
- -
-
-setInputCols(*value)
-

Sets column names of input annotations.

-
-
Parameters
-
-
*valuestr

Input columns for the annotator

-
-
-
-
-
- -
-
-setLazyAnnotator(value)
-

Sets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
-
Parameters
-
-
valuebool

Whether Annotator should be evaluated lazily in a -RecursivePipeline

-
-
-
-
-
- -
-
-setMaxSyntacticDistance(distance)[source]
-

Sets maximum syntactic distance between a pair of named entities to consider them as a relation

-
-
Parameters
-
-
distanceint

Maximum syntactic distance between a pair of named entities to consider them as a relation

-
-
-
-
-
- -
-
-setOutputCol(value)
-

Sets output column name of annotations.

-
-
Parameters
-
-
valuestr

Name of output column

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-setRelationPairs(pairs)[source]
-

Sets list of dash-separated pairs of named entities

-
-
Parameters
-
-
pairsstr

List of dash-separated pairs of named entities

-
-
-
-
-
- -
-
-transform(dataset, params=None)
-

Transforms the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset

-
-
paramsdict, optional

an optional param map that overrides embedded params.

-
-
-
-
Returns
-
-
pyspark.sql.DataFrame

transformed dataset

-
-
-
-
-
- -
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

-
\ No newline at end of file
diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.ReIdentification.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.ReIdentification.html
deleted file mode 100644
index 1c5c95e14c..0000000000
--- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.ReIdentification.html
+++ /dev/null
@@ -1,978 +0,0 @@

sparknlp_jsl.annotator.ReIdentification — Spark NLP 3.3.0 documentation

sparknlp_jsl.annotator.ReIdentification

-
-
-class sparknlp_jsl.annotator.ReIdentification(classname='com.johnsnowlabs.nlp.annotators.deid.ReIdentification', java_model=None)[source]
-

Bases: sparknlp.common.AnnotatorModel

-
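The page provides no description, so the following is only a hypothetical wiring sketch. Judging by its name, ReIdentification is assumed to restore the original text from the output of a prior de-identification step; only setInputCols, setOutputCol and transform are documented here, and the column names below are placeholders, not documented defaults.

# Hedged sketch; `deid_df` is assumed to be the output of a de-identification
# pipeline that kept the two columns named here.
reidentifier = ReIdentification() \
    .setInputCols(["original_document", "deidentified"]) \
    .setOutputCol("reidentified")

restored_df = reidentifier.transform(deid_df)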

Methods


__init__([classname, java_model])

Initialize this instance with a Java model object.

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

getInputCols()

Gets current column names of input annotations.

getLazyAnnotator()

Gets whether Annotator should be evaluated lazily in a RecursivePipeline.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getOutputCol()

Gets output column name of annotations.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

set(param, value)

Sets a parameter in the embedded param map.

setInputCols(*value)

Sets column names of input annotations.

setLazyAnnotator(value)

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

setOutputCol(value)

Sets output column name of annotations.

setParamValue(paramName)

Sets the value of a parameter.

setParams()

transform(dataset[, params])

Transforms the input dataset with optional parameters.

write()

Returns an MLWriter instance for this ML instance.

-

Attributes


getter_attrs

inputCols

lazyAnnotator

name

outputCol

params

Returns all params ordered by name.

-
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some extra params. This implementation first calls Params.copy and then makes a copy of the companion Java pipeline component with extra params, so both the Python wrapper and the Java pipeline component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
-
-getInputCols()
-

Gets current column names of input annotations.

-
- -
-
-getLazyAnnotator()
-

Gets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getOutputCol()
-

Gets output column name of annotations.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-property params
-

Returns all params ordered by name. The default implementation -uses dir() to get all attributes of type -Param.

-
- -
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setInputCols(*value)
-

Sets column names of input annotations.

-
-
Parameters
-
-
*valuestr

Input columns for the annotator

-
-
-
-
-
- -
-
-setLazyAnnotator(value)
-

Sets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
-
Parameters
-
-
valuebool

Whether Annotator should be evaluated lazily in a -RecursivePipeline

-
-
-
-
-
- -
-
-setOutputCol(value)
-

Sets output column name of annotations.

-
-
Parameters
-
-
valuestr

Name of output column

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-transform(dataset, params=None)
-

Transforms the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset

-
-
paramsdict, optional

an optional param map that overrides embedded params.

-
-
-
-
Returns
-
-
pyspark.sql.DataFrame

transformed dataset

-
-
-
-
-
- -
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

-
\ No newline at end of file
diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.RelationExtractionApproach.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.RelationExtractionApproach.html
deleted file mode 100644
index 51125d5cb6..0000000000
--- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.RelationExtractionApproach.html
+++ /dev/null
@@ -1,1364 +0,0 @@

sparknlp_jsl.annotator.RelationExtractionApproach — Spark NLP 3.3.0 documentation

sparknlp_jsl.annotator.RelationExtractionApproach

-
-
-class sparknlp_jsl.annotator.RelationExtractionApproach(classname='com.johnsnowlabs.nlp.annotators.re.RelationExtractionApproach')[source]
-

Bases: sparknlp_jsl.annotator.GenericClassifierApproach

-

Trains a TensorFlow model for relation extraction. The TensorFlow graph in .pb format needs to be specified with setModelFile. The result is a RelationExtractionModel. To start training, see the parameters that need to be set in the Parameters section.

- ---- - - - - - - - - - - -

Input Annotation types

Output Annotation type

WORD_EMBEDDINGS, POS, CHUNK, DEPENDENCY

CATEGORY

-
-
Parameters
-
-
fromEntityBeginCol

From Entity Beginning Column

-
-
fromEntityEndCol

From Entity End Column

-
-
fromEntityLabelCol

From Entity Label Column

-
-
toEntityBeginCol

To Entity Beginning Column

-
-
toEntityEndCol

To Entity End Column

-
-
toEntityLabelCol

To Entity Label Column

-
-
-
-
-

Examples

-
>>> import sparknlp
->>> from sparknlp.base import *
->>> from sparknlp.common import *
->>> from sparknlp.annotator import *
->>> from sparknlp.training import *
->>> import sparknlp_jsl
->>> from sparknlp_jsl.base import *
->>> from sparknlp_jsl.annotator import *
->>> from pyspark.ml import Pipeline
->>> documentAssembler = DocumentAssembler()     ...   .setInputCol("text")     ...   .setOutputCol("document")
-...
->>> tokenizer = Tokenizer()     ...   .setInputCols(["document"])     ...   .setOutputCol("tokens")
-...
->>> embedder = WordEmbeddingsModel     ...   .pretrained("embeddings_clinical", "en", "clinical/models")     ...   .setInputCols(["document", "tokens"])     ...   .setOutputCol("embeddings")
-...
->>> posTagger = PerceptronModel     ...   .pretrained("pos_clinical", "en", "clinical/models")     ...   .setInputCols(["document", "tokens"])     ...   .setOutputCol("posTags")
-...
->>> nerTagger = MedicalNerModel     ...   .pretrained("ner_events_clinical", "en", "clinical/models")     ...   .setInputCols(["document", "tokens", "embeddings"])     ...   .setOutputCol("ner_tags")
-...
->>> nerConverter = NerConverter()     ...   .setInputCols(["document", "tokens", "ner_tags"])     ...   .setOutputCol("nerChunks")
-...
->>> depencyParser = DependencyParserModel     ...   .pretrained("dependency_conllu", "en")     ...   .setInputCols(["document", "posTags", "tokens"])     ...   .setOutputCol("dependencies")
-...
->>> re = RelationExtractionApproach() \
-...   .setInputCols(["embeddings", "posTags", "train_ner_chunks", "dependencies"]) \
-...   .setOutputCol("relations_t") \
-...   .setLabelColumn("target_rel") \
-...   .setEpochsNumber(300) \
-...   .setBatchSize(200) \
-...   .setLearningRate(0.001) \
-...   .setModelFile("path/to/graph_file.pb") \
-...   .setFixImbalance(True) \
-...   .setValidationSplit(0.05) \
-...   .setFromEntity("from_begin", "from_end", "from_label") \
-...   .setToEntity("to_begin", "to_end", "to_label")
-...
->>> pipeline = Pipeline(stages=[
-...     documentAssembler,
-...     tokenizer,
-...     embedder,
-...     posTagger,
-...     nerTagger,
-...     nerConverter,
-...     depencyParser,
-...     re])
-
-
-
>>> model = pipeline.fit(trainData)
-
-
-
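A short follow-up sketch (column and path names assumed, not prescribed by this page): once the pipeline is fitted, the relation predictions can be inspected and the trained stage persisted with the save() method documented below.

# Apply the fitted pipeline and look at the relations produced in "relations_t".
predictions = model.transform(trainData)
predictions.selectExpr("explode(relations_t) as rel") \
    .selectExpr("rel.result", "rel.metadata").show(truncate=False)

# The last stage of the fitted pipeline is the trained relation extraction model.
model.stages[-1].save("/tmp/relation_extraction_model")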

Methods


__init__([classname])

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

fit(dataset[, params])

Fits a model to the input dataset with optional parameters.

fitMultiple(dataset, paramMaps)

Fits a model to the input dataset for each param map in paramMaps.

getInputCols()

Gets current column names of input annotations.

getLazyAnnotator()

Gets whether Annotator should be evaluated lazily in a RecursivePipeline.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getOutputCol()

Gets output column name of annotations.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

set(param, value)

Sets a parameter in the embedded param map.

setBatchSize(size)

Sets the batch size for the optimization process

setCustomLabels(labels)

Sets custom relation labels

setDropout(dropout)

Sets the dropout rate

setEpochsNumber(epochs)

Sets number of epochs for the optimization process

setFeatureScaling(feature_scaling)

Sets Feature scaling method.

setFixImbalance(fix_imbalance)

Sets a flag indicating whether to balance the training set.

setFromEntity(begin_col, end_col, label_col)

Sets from entity

setInputCols(*value)

Sets column names of input annotations.

setLabelCol(label_column)

Sets the name of the column containing the training label

setLazyAnnotator(value)

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

setLearningRate(lamda)

Sets learning rate for the optimization process

setModelFile(mode_file)

Sets the file name to load the model from

setOutputCol(value)

Sets output column name of annotations.

setOutputLogsPath(output_logs_path)

Sets path to folder where logs will be saved.

setParamValue(paramName)

Sets the value of a parameter.

setToEntity(begin_col, end_col, label_col)

Sets to entity

setValidationSplit(validation_split)

Sets the validation split - how much data to use for validation.

write()

Returns an MLWriter instance for this ML instance.

-

Attributes


batchSize

customLabels

dropout

epochsN

featureScaling

fixImbalance

fromEntityBeginCol

fromEntityEndCol

fromEntityLabelCol

getter_attrs

inputCols

labelColumn

lazyAnnotator

learningRate

modelFile

name

outputCol

outputLogsPath

params

Returns all params ordered by name.

toEntityBeginCol

toEntityEndCol

toEntityLabelCol

validationSplit

-
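Before the detailed method reference below, a brief configuration sketch shows how the setters listed above fit together. This is a hypothetical illustration: the annotator name RelationExtractionApproach, the column names, and the graph file path are assumptions, not taken from this page:

>>> re = RelationExtractionApproach() \
...     .setInputCols(["embeddings", "posTags", "nerChunks", "dependencies"]) \
...     .setOutputCol("relations") \
...     .setLabelCol("rel") \
...     .setEpochsNumber(70) \
...     .setBatchSize(200) \
...     .setDropout(0.5) \
...     .setLearningRate(0.001) \
...     .setFixImbalance(True) \
...     .setModelFile("/tmp/RE.in1200.out20.pb") \
...     .setFromEntity("begin1", "end1", "label1") \
...     .setToEntity("begin2", "end2", "label2")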
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some -extra params. This implementation first calls Params.copy and -then make a copy of the companion Java pipeline component with -extra params. So both the Python wrapper and the Java pipeline -component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
-
-fit(dataset, params=None)
-

Fits a model to the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset.

-
-
paramsdict or list or tuple, optional

an optional param map that overrides embedded params. If a list/tuple of -param maps is given, this calls fit on each param map and returns a list of -models.

-
-
-
-
Returns
-
-
Transformer or a list of Transformer

fitted model(s)

-
-
-
-
-
- -
-
-fitMultiple(dataset, paramMaps)
-

Fits a model to the input dataset for each param map in paramMaps.

-
-

New in version 2.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset.

-
-
paramMapscollections.abc.Sequence

A Sequence of param maps.

-
-
-
-
Returns
-
-
_FitMultipleIterator

A thread safe iterable which contains one model for each param map. Each -call to next(modelIterator) will return (index, model) where model was fit -using paramMaps[index]. index values may not be sequential.

-
-
-
-
-
- -
-
-getInputCols()
-

Gets current column names of input annotations.

-
- -
-
-getLazyAnnotator()
-

Gets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getOutputCol()
-

Gets output column name of annotations.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-property params
-

Returns all params ordered by name. The default implementation -uses dir() to get all attributes of type -Param.

-
- -
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setBatchSize(size)
-

Sets the size of each batch in the optimization process.

-
-
Parameters
-
-
sizeint

Size for each batch in the optimization process

-
-
-
-
-
- -
-
-setCustomLabels(labels)[source]
-

Sets custom relation labels

-
-
Parameters
-
-
labelsdict[str, str]

Dictionary which maps old to new labels

-
-
-
-
-
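For illustration only, the mapping can be used to rename the model's raw relation labels to more readable ones on the annotator configured above (the label values below are hypothetical):

>>> # Rename raw output labels; keys are the existing labels, values are the new names.
>>> re.setCustomLabels({"1": "is_related", "0": "not_related"})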
- -
-
-setDropout(dropout)
-

Sets the dropout at the output of each layer.

-
-
Parameters
-
-
dropoutfloat

Dropout at the output of each layer

-
-
-
-
-
- -
-
-setEpochsNumber(epochs)
-

Sets number of epochs for the optimization process

-
-
Parameters
-
-
epochsint

Number of epochs for the optimization process

-
-
-
-
-
- -
-
-setFeatureScaling(feature_scaling)
-

Sets the feature scaling method. Possible values are ‘zscore’, ‘minmax’ or empty (no scaling).

-
-
Parameters
-
-
feature_scalingstr

Feature scaling method. Possible values are ‘zscore’, ‘minmax’ or empty (no scaling).

-
-
-
-
-
- -
-
-setFixImbalance(fix_imbalance)
-

Sets a flag indicating whether to balance the training set.

-
-
Parameters
-
-
fix_imbalancebool

A flag indicating whether to balance the training set.

-
-
-
-
-
- -
-
-setFromEntity(begin_col, end_col, label_col)[source]
-

Sets from entity

-
-
Parameters
-
-
begin_colstr

Column that has a reference of where the chunk begins

-
-
end_col: str

Column that has a reference of where the chunk ends

-
-
label_col: str

Column that has a reference to the type of the chunk

-
-
-
-
-
- -
-
-setInputCols(*value)
-

Sets column names of input annotations.

-
-
Parameters
-
-
*valuestr

Input columns for the annotator

-
-
-
-
-
- -
-
-setLabelCol(label_column)
-

Sets the column with the value we are trying to predict.

-
-
Parameters
-
-
labelstr

Column with the value result we are trying to predict.

-
-
-
-
-
- -
-
-setLazyAnnotator(value)
-

Sets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
-
Parameters
-
-
valuebool

Whether Annotator should be evaluated lazily in a -RecursivePipeline

-
-
-
-
-
- -
-
-setLearningRate(lamda)
-

Sets learning rate for the optimization process

-
-
Parameters
-
-
lamdafloat

Learning rate for the optimization process

-
-
-
-
-
- -
-
-setModelFile(mode_file)
-

Sets the file name to load the model from.

-
-
Parameters
-
-
labelstr

File name to load the model from.

-
-
-
-
-
- -
-
-setOutputCol(value)
-

Sets output column name of annotations.

-
-
Parameters
-
-
valuestr

Name of output column

-
-
-
-
-
- -
-
-setOutputLogsPath(output_logs_path)
-

Sets path to folder where logs will be saved. If no path is specified, no logs are generated

-
-
Parameters
-
-
labelstr

Path to folder where logs will be saved. If no path is specified, no logs are generated

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-setToEntity(begin_col, end_col, label_col)[source]
-

Sets to entity

-
-
Parameters
-
-
begin_colstr

Column that has a reference of where the chunk begins

-
-
end_col: str

Column that has a reference of where the chunk ends

-
-
label_col: str

Column that has a reference to the type of the chunk

-
-
-
-
-
- -
-
-setValidationSplit(validation_split)
-

Sets the validation split - how much data to use for validation.

-
-
Parameters
-
-
validation_splitfloat

Validation split - how much data to use for validation.

-
-
-
-
-
- -
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

-
- -
- -
- - -
- - - - -
- - -
-
- - - -
-
- - - - - -
-
- -
\ No newline at end of file
diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.RelationExtractionDLModel.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.RelationExtractionDLModel.html
deleted file mode 100644
index 0b39a2c6bb..0000000000
--- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.RelationExtractionDLModel.html
+++ /dev/null
@@ -1,1191 +0,0 @@
-sparknlp_jsl.annotator.RelationExtractionDLModel — Spark NLP 3.3.0 documentation
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl.annotator.RelationExtractionDLModel

-
-
-class sparknlp_jsl.annotator.RelationExtractionDLModel(classname='com.johnsnowlabs.nlp.annotators.re.RelationExtractionDLModel', java_model=None)[source]
-

Bases: sparknlp.common.AnnotatorModel

-

Extracts and classifies instances of relations between named entities. -In contrast with RelationExtractionModel, RelationExtractionDLModel is based on BERT. -For pretrained models please see the


Input Annotation types

Output Annotation type

CHUNK, DOCUMENT

CATEGORY

-
-
Parameters
-
-
predictionThreshold

Minimal activation of the target unit to encode a new relation instance

-
-
batchSize

Number of relations to process at once

-
-
-
-
-

Examples

-
>>> import sparknlp
->>> from sparknlp.base import *
->>> from sparknlp.common import *
->>> from sparknlp.annotator import *
->>> from sparknlp.training import *
->>> import sparknlp_jsl
->>> from sparknlp_jsl.base import *
->>> from sparknlp_jsl.annotator import *
->>> from pyspark.ml import Pipeline
->>> documentAssembler = DocumentAssembler() \
-...   .setInputCol("text") \
-...   .setOutputCol("document")
-...
->>> tokenizer = Tokenizer() \
-...   .setInputCols(["document"]) \
-...   .setOutputCol("tokens")
-...
->>> embedder = WordEmbeddingsModel \
-...   .pretrained("embeddings_clinical", "en", "clinical/models") \
-...   .setInputCols(["document", "tokens"]) \
-...   .setOutputCol("embeddings")
-...
->>> posTagger = PerceptronModel \
-...   .pretrained("pos_clinical", "en", "clinical/models") \
-...   .setInputCols(["document", "tokens"]) \
-...   .setOutputCol("posTags")
-...
->>> nerTagger = MedicalNerModel \
-...   .pretrained("ner_events_clinical", "en", "clinical/models") \
-...   .setInputCols(["document", "tokens", "embeddings"]) \
-...   .setOutputCol("ner_tags")
-...
->>> nerConverter = NerConverter() \
-...   .setInputCols(["document", "tokens", "ner_tags"]) \
-...   .setOutputCol("nerChunks")
-...
->>> depencyParser = DependencyParserModel \
-...   .pretrained("dependency_conllu", "en") \
-...   .setInputCols(["document", "posTags", "tokens"]) \
-...   .setOutputCol("dependencies")
-...
->>> relationPairs = [
-...   "direction-external_body_part_or_region",
-...   "external_body_part_or_region-direction",
-...   "direction-internal_organ_or_component",
-...   "internal_organ_or_component-direction"
-... ]
->>> re_ner_chunk_filter = RENerChunksFilter()\
-...   .setInputCols(["ner_chunks", "dependencies"])\
-...   .setOutputCol("re_ner_chunks")\
-...   .setMaxSyntacticDistance(4)\
-...   .setRelationPairs(["internal_organ_or_component-direction"])
-...
->>> re_model = RelationExtractionDLModel.pretrained("redl_bodypart_direction_biobert", "en", "clinical/models") \
-...     .setInputCols(["re_ner_chunks", "sentences"]) \
-...     .setOutputCol("relations") \
-...     .setPredictionThreshold(0.5)
-...
->>> pipeline = Pipeline(stages=[
-...     documentAssembler,
-...     tokenizer,
-...     embedder,
-...     posTagger,
-...     nerTagger,
-...     nerConverter,
-...     depencyParser,
-...     re_ner_chunk_filter ,
-...     re_model])
-
-
-
>>> model = pipeline.fit(trainData)
->>> data = spark.createDataFrame([["MRI demonstrated infarction in the upper brain stem , left cerebellum and  right basil ganglia"]]).toDF("text")
->>> result = pipeline.fit(data).transform(data)
-...
->>> result.selectExpr("explode(relations) as relations")
-...  .select(
-...    "relations.metadata.chunk1",
-...    "relations.metadata.entity1",
-...    "relations.metadata.chunk2",
-...    "relations.metadata.entity2",
-...    "relations.result"
-...  )
-...  .where("result != 0")
-...  .show(truncate=False)
-...
-... # Show results
-... result.selectExpr("explode(relations) as relations") \
-...   .select(
-...      "relations.metadata.chunk1",
-...      "relations.metadata.entity1",
-...      "relations.metadata.chunk2",
-...      "relations.metadata.entity2",
-...      "relations.result"
-...   ).where("result != 0")     ...   .show(truncate=False)
-+------+---------+-------------+---------------------------+------+
-|chunk1|entity1  |chunk2       |entity2                    |result|
-+------+---------+-------------+---------------------------+------+
-|upper |Direction|brain stem   |Internal_organ_or_component|1     |
-|left  |Direction|cerebellum   |Internal_organ_or_component|1     |
-|right |Direction|basil ganglia|Internal_organ_or_component|1     |
-+------+---------+-------------+---------------------------+------+
-
-
-

Methods


__init__([classname, java_model])

Initialize this instance with a Java model object.

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

getClasses()

Returns labels used to train this model

getInputCols()

Gets current column names of input annotations.

getLazyAnnotator()

Gets whether Annotator should be evaluated lazily in a RecursivePipeline.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getOutputCol()

Gets output column name of annotations.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

loadSavedModel(folder, spark_session)

pretrained(name[, lang, remote_loc])

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

set(param, value)

Sets a parameter in the embedded param map.

setBatchSize(value)

Sets number of relations to process at once

setCaseSensitive(value)

setCustomLabels(labels)

Sets custom relation labels

setInputCols(*value)

Sets column names of input annotations.

setLazyAnnotator(value)

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

setOutputCol(value)

Sets output column name of annotations.

setParamValue(paramName)

Sets the value of a parameter.

setParams()

setPredictionThreshold(threshold)

Sets the minimal activation of the target unit to encode a new relation instance.

transform(dataset[, params])

Transforms the input dataset with optional parameters.

write()

Returns an MLWriter instance for this ML instance.

-

Attributes


batchSize

classes

customLabels

getter_attrs

inputCols

lazyAnnotator

name

outputCol

params

Returns all params ordered by name.

predictionThreshold

-
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some -extra params. This implementation first calls Params.copy and -then make a copy of the companion Java pipeline component with -extra params. So both the Python wrapper and the Java pipeline -component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
-
-getClasses()[source]
-

Returns labels used to train this model

-
- -
-
-getInputCols()
-

Gets current column names of input annotations.

-
- -
-
-getLazyAnnotator()
-

Gets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getOutputCol()
-

Gets output column name of annotations.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-property params
-

Returns all params ordered by name. The default implementation -uses dir() to get all attributes of type -Param.

-
- -
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setBatchSize(value)[source]
-

Sets number of relations to process at once

-
-
Parameters
-
-
valueint

Number of relations to process at once

-
-
-
-
-
- -
-
-setCustomLabels(labels)[source]
-

Sets custom relation labels

-
-
Parameters
-
-
labelsdict[str, str]

Dictionary which maps old to new labels

-
-
-
-
-
- -
-
-setInputCols(*value)
-

Sets column names of input annotations.

-
-
Parameters
-
-
*valuestr

Input columns for the annotator

-
-
-
-
-
- -
-
-setLazyAnnotator(value)
-

Sets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
-
Parameters
-
-
valuebool

Whether Annotator should be evaluated lazily in a -RecursivePipeline

-
-
-
-
-
- -
-
-setOutputCol(value)
-

Sets output column name of annotations.

-
-
Parameters
-
-
valuestr

Name of output column

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-setPredictionThreshold(threshold)[source]
-

Sets the minimal activation of the target unit to encode a new relation instance.

-
-
Parameters
-
-
thresholdfloat

Minimal activation of the target unit to encode a new relation instance

-
-
-
-
-
- -
-
-transform(dataset, params=None)
-

Transforms the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset

-
-
paramsdict, optional

an optional param map that overrides embedded params.

-
-
-
-
Returns
-
-
pyspark.sql.DataFrame

transformed dataset

-
-
-
-
-
- -
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

-
- -
- -
- - -
- - - - -
- - -
-
- - - -
-
- - - - - -
-
- -
\ No newline at end of file
diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.RelationExtractionModel.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.RelationExtractionModel.html
deleted file mode 100644
index 2d69c03f8a..0000000000
--- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.RelationExtractionModel.html
+++ /dev/null
@@ -1,1203 +0,0 @@
-sparknlp_jsl.annotator.RelationExtractionModel — Spark NLP 3.3.0 documentation
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl.annotator.RelationExtractionModel

-
-
-class sparknlp_jsl.annotator.RelationExtractionModel(classname='com.johnsnowlabs.nlp.annotators.re.RelationExtractionModel', java_model=None)[source]
-

Bases: sparknlp.common.AnnotatorModel

-

Extracts and classifies instances of relations between named entities. This is the trained model produced by the relation extraction training approach (where the TensorFlow graph in .pb format is specified with setModelFile); the parameters below control how relations are predicted.


Input Annotation types

Output Annotation type

WORD_EMBEDDINGS, POS, CHUNK, DEPENDENCY

CATEGORY

-
-
Parameters
-
-
predictionThreshold

Minimal activation of the target unit to encode a new relation instance

-
-
relationPairs

List of dash-separated pairs of named entities (“ENTITY1-ENTITY2”, e.g. “Biomarker-RelativeDay”), which will be processed

-
-
maxSyntacticDistance

Maximal syntactic distance, as threshold (Default: 0)

-
-
-
-
-

Examples

-
>>> import sparknlp
->>> from sparknlp.base import *
->>> from sparknlp.common import *
->>> from sparknlp.annotator import *
->>> from sparknlp.training import *
->>> import sparknlp_jsl
->>> from sparknlp_jsl.base import *
->>> from sparknlp_jsl.annotator import *
->>> from pyspark.ml import Pipeline
->>> documentAssembler = DocumentAssembler() \
-...   .setInputCol("text") \
-...   .setOutputCol("document")
-...
->>> tokenizer = Tokenizer() \
-...   .setInputCols(["document"]) \
-...   .setOutputCol("tokens")
-...
->>> embedder = WordEmbeddingsModel \
-...   .pretrained("embeddings_clinical", "en", "clinical/models") \
-...   .setInputCols(["document", "tokens"]) \
-...   .setOutputCol("embeddings")
-...
->>> posTagger = PerceptronModel \
-...   .pretrained("pos_clinical", "en", "clinical/models") \
-...   .setInputCols(["document", "tokens"]) \
-...   .setOutputCol("posTags")
-...
->>> nerTagger = MedicalNerModel \
-...   .pretrained("ner_events_clinical", "en", "clinical/models") \
-...   .setInputCols(["document", "tokens", "embeddings"]) \
-...   .setOutputCol("ner_tags")
-...
->>> nerConverter = NerConverter() \
-...   .setInputCols(["document", "tokens", "ner_tags"]) \
-...   .setOutputCol("nerChunks")
-...
->>> depencyParser = DependencyParserModel \
-...   .pretrained("dependency_conllu", "en") \
-...   .setInputCols(["document", "posTags", "tokens"]) \
-...   .setOutputCol("dependencies")
-...
->>> relationPairs = [
-...   "direction-external_body_part_or_region",
-...   "external_body_part_or_region-direction",
-...   "direction-internal_organ_or_component",
-...   "internal_organ_or_component-direction"
-... ]
-...
->>> re_model = RelationExtractionModel.pretrained("re_bodypart_directions", "en", "clinical/models") \
-...     .setInputCols(["embeddings", "pos_tags", "ner_chunks", "dependencies"]) \
-...     .setOutputCol("relations") \
-...     .setRelationPairs(relationPairs) \
-...     .setMaxSyntacticDistance(4) \
-...     .setPredictionThreshold(0.9)
-...
->>> pipeline = Pipeline(stages=[
-...     documentAssembler,
-...     tokenizer,
-...     embedder,
-...     posTagger,
-...     nerTagger,
-...     nerConverter,
-...     depencyParser,
-...     re_model])
-
-
-
>>> model = pipeline.fit(trainData)
->>> data = spark.createDataFrame([["MRI demonstrated infarction in the upper brain stem , left cerebellum and  right basil ganglia"]]).toDF("text")
->>> result = pipeline.fit(data).transform(data)
-...
->>> result.selectExpr("explode(relations) as relations")
-...  .select(
-...    "relations.metadata.chunk1",
-...    "relations.metadata.entity1",
-...    "relations.metadata.chunk2",
-...    "relations.metadata.entity2",
-...    "relations.result"
-...  )
-...  .where("result != 0")
-...  .show(truncate=False)
-...
-... # Show results
-... result.selectExpr("explode(relations) as relations") \
-...   .select(
-...      "relations.metadata.chunk1",
-...      "relations.metadata.entity1",
-...      "relations.metadata.chunk2",
-...      "relations.metadata.entity2",
-...      "relations.result"
-...   ).where("result != 0")     ...   .show(truncate=False)
-+------+---------+-------------+---------------------------+------+
-|chunk1|entity1  |chunk2       |entity2                    |result|
-+------+---------+-------------+---------------------------+------+
-|upper |Direction|brain stem   |Internal_organ_or_component|1     |
-|left  |Direction|cerebellum   |Internal_organ_or_component|1     |
-|right |Direction|basil ganglia|Internal_organ_or_component|1     |
-+------+---------+-------------+---------------------------+------+
-
-
-

Methods


__init__([classname, java_model])

Initialize this instance with a Java model object.

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

getClasses()

Returns labels used to train this model

getInputCols()

Gets current column names of input annotations.

getLazyAnnotator()

Gets whether Annotator should be evaluated lazily in a RecursivePipeline.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getOutputCol()

Gets output column name of annotations.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

pretrained(name[, lang, remote_loc])

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

set(param, value)

Sets a parameter in the embedded param map.

setCustomLabels(labels)

Sets custom relation labels

setInputCols(*value)

Sets column names of input annotations.

setLazyAnnotator(value)

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

setMaxSyntacticDistance(distance)

Sets maximal syntactic distance, as threshold (Default: 0)

setOutputCol(value)

Sets output column name of annotations.

setParamValue(paramName)

Sets the value of a parameter.

setParams()

setPredictionThreshold(threshold)

Sets Minimal activation of the target unit to encode a new relation instance

setRelationPairs(pairs)

Sets List of dash-separated pairs of named entities ("ENTITY1-ENTITY2", e.g.

transform(dataset[, params])

Transforms the input dataset with optional parameters.

write()

Returns an MLWriter instance for this ML instance.

-

Attributes


classes

customLabels

getter_attrs

inputCols

lazyAnnotator

maxSyntacticDistance

name

outputCol

params

Returns all params ordered by name.

predictionThreshold

relationPairs

-
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some -extra params. This implementation first calls Params.copy and -then make a copy of the companion Java pipeline component with -extra params. So both the Python wrapper and the Java pipeline -component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
-
-getClasses()[source]
-

Returns labels used to train this model

-
- -
-
-getInputCols()
-

Gets current column names of input annotations.

-
- -
-
-getLazyAnnotator()
-

Gets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getOutputCol()
-

Gets output column name of annotations.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-property params
-

Returns all params ordered by name. The default implementation -uses dir() to get all attributes of type -Param.

-
- -
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setCustomLabels(labels)[source]
-

Sets custom relation labels

-
-
Parameters
-
-
labelsdict[str, str]

Dictionary which maps old to new labels

-
-
-
-
-
- -
-
-setInputCols(*value)
-

Sets column names of input annotations.

-
-
Parameters
-
-
*valuestr

Input columns for the annotator

-
-
-
-
-
- -
-
-setLazyAnnotator(value)
-

Sets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
-
Parameters
-
-
valuebool

Whether Annotator should be evaluated lazily in a -RecursivePipeline

-
-
-
-
-
- -
-
-setMaxSyntacticDistance(distance)[source]
-

Sets maximal syntactic distance, as threshold (Default: 0)

-
-
Parameters
-
-
distanceint

Maximal syntactic distance, as threshold (Default: 0)

-
-
-
-
-
- -
-
-setOutputCol(value)
-

Sets output column name of annotations.

-
-
Parameters
-
-
valuestr

Name of output column

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-setPredictionThreshold(threshold)[source]
-

Sets Minimal activation of the target unit to encode a new relation instance

-
-
Parameters
-
-
thresholdfloat

Minimal activation of the target unit to encode a new relation instance

-
-
-
-
-
- -
-
-setRelationPairs(pairs)[source]
-

Sets List of dash-separated pairs of named entities (“ENTITY1-ENTITY2”, e.g. “Biomarker-RelativeDay”), which will be processed

-
-
Parameters
-
-
pairsstr

List of dash-separated pairs of named entities (“ENTITY1-ENTITY2”, e.g. “Biomarker-RelativeDay”), which will be processed

-
-
-
-
-
- -
-
-transform(dataset, params=None)
-

Transforms the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset

-
-
paramsdict, optional

an optional param map that overrides embedded params.

-
-
-
-
Returns
-
-
pyspark.sql.DataFrame

transformed dataset

-
-
-
-
-
- -
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

-
- -
- -
- - -
- - - - -
- - -
-
- - - -
-
- - - - - -
-
- -
\ No newline at end of file
diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.Router.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.Router.html
deleted file mode 100644
index 6c1b1392f0..0000000000
--- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.Router.html
+++ /dev/null
@@ -1,1105 +0,0 @@
-sparknlp_jsl.annotator.Router — Spark NLP 3.3.0 documentation
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl.annotator.Router

-
-
-class sparknlp_jsl.annotator.Router(classname='com.johnsnowlabs.nlp.Router', java_model=None)[source]
-

Bases: sparknlp.common.AnnotatorModel

-

Converts chunks from RegexMatcher into chunks with an entity in the metadata, using the identifier or field as the entity.


Input Annotation types

Output Annotation type

ANY

ANY

-
-
Parameters
-
-
inputType

The type of annotation to filter on (default: sentence_embeddings). Possible values: document|token|wordpiece|word_embeddings|sentence_embeddings|category|date|sentiment|pos|chunk|named_entity|regex|dependency|labeled_dependency|language|keyword

-
-
filterFieldsElements

The filterfieldsElements are the allowed values for the metadata field that is being used

-
-
metadataField

The key in the metadata dictionary that you want to filter (by default entity)

-
-
-
-
-

Examples

-
>>> test_data = spark.createDataFrame(sentences).toDF("text")
->>> document = DocumentAssembler().setInputCol("text").setOutputCol("document")
->>> sentence = SentenceDetector().setInputCols("document").setOutputCol("sentence")
->>> regexMatcher = RegexMatcher().setExternalRules("../src/test/resources/regex-matcher/rules2.txt", ",") \
-...     .setInputCols("sentence") \
-...     .setOutputCol("regex") \
-...     .setStrategy("MATCH_ALL")
->>> chunk2Doc = Chunk2Doc().setInputCols("regex").setOutputCol("doc_chunk")
->>> embeddings = BertSentenceEmbeddings.pretrained("sent_small_bert_L2_128") \
-...     .setInputCols("doc_chunk") \
-...     .setOutputCol("bert") \
-...     .setCaseSensitive(False) \
-...     .setMaxSentenceLength(32)
->>> router_name_embeddings = Router() \
-...     .setInputType("sentence_embeddings") \
-...     .setInputCols("bert") \
-...     .setMetadataField("identifier") \
-...     .setFilterFieldsElements(["name"]) \
-...     .setOutputCol("names_embeddings")    >>> router_city_embeddings = Router() \
-...     .setInputType("sentence_embeddings") \
-...     .setInputCols(["bert"]) \
-...     .setMetadataField("identifier") \
-...     .setFilterFieldsElements(["city"]) \
-...     .setOutputCol("cities_embeddings")
->>> router_names = Router() \
-...     .setInputType("chunk") \
-...     .setInputCols("regex") \
-...     .setMetadataField("identifier") \
-...     .setFilterFieldsElements(["name"]) \
-...     .setOutputCol("names_chunks")
->>> pipeline = Pipeline().setStages(
->>>     [document, sentence, regexMatcher, chunk2Doc, router_names, embeddings, router_name_embeddings,
-...      router_city_embeddings])
-
-
-
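Once the pipeline above is fitted and applied, each Router writes only the annotations whose metadata field matched its filter into its own output column. A brief, hypothetical way to inspect the routed columns, reusing test_data from the example:

>>> result = pipeline.fit(test_data).transform(test_data)
>>> # Chunks whose metadata "identifier" was "name"
>>> result.selectExpr("explode(names_chunks.result) as name").show(truncate=False)
>>> # Sentence embeddings routed for "city" chunks
>>> result.selectExpr("explode(cities_embeddings.embeddings) as city_embedding").show(truncate=False)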

Methods


__init__([classname, java_model])

Initialize this instance with a Java model object.

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

getInputCols()

Gets current column names of input annotations.

getLazyAnnotator()

Gets whether Annotator should be evaluated lazily in a RecursivePipeline.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getOutputCol()

Gets output column name of annotations.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

set(param, value)

Sets a parameter in the embedded param map.

setFilterFieldsElements(value)

Sets the allowed values for the metadata field that is being filtered on.

setInputCols(*value)

Sets column names of input annotations.

setInputType(value)

Sets the type of annotation to filter on (default: sentence_embeddings).

setLazyAnnotator(value)

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

setMetadataField(value)

Sets the key in the metadata dictionary that you want to filter (by default 'entity')

setOutputCol(value)

Sets output column name of annotations.

setParamValue(paramName)

Sets the value of a parameter.

setParams()

transform(dataset[, params])

Transforms the input dataset with optional parameters.

write()

Returns an MLWriter instance for this ML instance.

-

Attributes


filterFieldsElements

getter_attrs

inputCols

inputType

lazyAnnotator

metadataField

name

outputCol

params

Returns all params ordered by name.

-
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some -extra params. This implementation first calls Params.copy and -then make a copy of the companion Java pipeline component with -extra params. So both the Python wrapper and the Java pipeline -component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
-
-getInputCols()
-

Gets current column names of input annotations.

-
- -
-
-getLazyAnnotator()
-

Gets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getOutputCol()
-

Gets output column name of annotations.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-property params
-

Returns all params ordered by name. The default implementation -uses dir() to get all attributes of type -Param.

-
- -
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setFilterFieldsElements(value)[source]
-

Sets the allowed values for the metadata field that is being filtered on.

-
-
Parameters
-
-
valuelist

The allowed values for the metadata field that is being filtered on.

-
-
-
-
-
- -
-
-setInputCols(*value)
-

Sets column names of input annotations.

-
-
Parameters
-
-
*valuestr

Input columns for the annotator

-
-
-
-
-
- -
-
-setInputType(value)[source]
-

Sets the type of annotation to filter on (default: sentence_embeddings).

-
-
Parameters
-
-
valuestr

The type of annotation to filter on (default: sentence_embeddings)

-
-
-
-
-
- -
-
-setLazyAnnotator(value)
-

Sets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
-
Parameters
-
-
valuebool

Whether Annotator should be evaluated lazily in a -RecursivePipeline

-
-
-
-
-
- -
-
-setMetadataField(value)[source]
-

Sets the key in the metadata dictionary that you want to filter (by default ‘entity’)

-
-
Parameters
-
-
valuestr

The key in the metadata dictionary that you want to filter (by default ‘entity’)

-
-
-
-
-
- -
-
-setOutputCol(value)
-

Sets output column name of annotations.

-
-
Parameters
-
-
valuestr

Name of output column

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-transform(dataset, params=None)
-

Transforms the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset

-
-
paramsdict, optional

an optional param map that overrides embedded params.

-
-
-
-
Returns
-
-
pyspark.sql.DataFrame

transformed dataset

-
-
-
-
-
- -
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

-
- -
- -
- - -
- - - - -
- - -
-
- - - -
-
- - - - - -
-
- -
\ No newline at end of file
diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.SentenceEntityResolverApproach.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.SentenceEntityResolverApproach.html
deleted file mode 100644
index d1d7a95c96..0000000000
--- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.SentenceEntityResolverApproach.html
+++ /dev/null
@@ -1,1345 +0,0 @@
-sparknlp_jsl.annotator.SentenceEntityResolverApproach — Spark NLP 3.3.0 documentation
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl.annotator.SentenceEntityResolverApproach

-
-
-class sparknlp_jsl.annotator.SentenceEntityResolverApproach[source]
-

Bases: sparknlp.common.AnnotatorApproach, sparknlp_jsl.annotator.SentenceResolverParams

-

This class contains all the parameters and methods to train a SentenceEntityResolverModel. The model transforms a dataset with input annotation type SENTENCE_EMBEDDINGS, coming from e.g. [BertSentenceEmbeddings](/docs/en/transformers#bertsentenceembeddings), and returns the normalized entity for a particular trained ontology / curated dataset (e.g. ICD-10, RxNorm, SNOMED etc.).


Input Annotation types

Output Annotation type

SENTENCE_EMBEDDINGS

ENTITY

-
-
Parameters
-
-
labelCol

Column name for the value we are trying to resolve

-
-
normalizedCol

Column name for the original, normalized description

-
-
pretrainedModelPath

Path to an already trained SentenceEntityResolverModel, which is used as a starting point for training the new model.

-
-
overrideExistingCodes

Whether to override the existing codes with new data while continue the training from a pretrained model. Default value is false(keep all the codes).

-
-
returnCosineDistances

Extract Cosine Distances. TRUE or False

-
-
aux_label_col

Auxiliary label which maps resolved entities to additional labels

-
-
useAuxLabel

Use AuxLabel Col or not

-
-
overrideExistingCodes

Whether to override the codes present in a pretrained model with new codes when the training process begins with a pretrained model

-
-
dropCodesList

A list of codes in a pretrained model that will be omitted when the training process begins with a pretrained model

-
-
-
-
-

Examples

-
>>> import sparknlp
->>> from sparknlp.base import *
->>> from sparknlp.common import *
->>> from sparknlp.annotator import *
->>> from sparknlp.training import *
->>> import sparknlp_jsl
->>> from sparknlp_jsl.base import *
->>> from sparknlp_jsl.annotator import *
->>> from pyspark.ml import Pipeline
->>> documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
->>> sentenceDetector = SentenceDetector().setInputCols(["document"]).setOutputCol("sentence")
->>> tokenizer = Tokenizer().setInputCols(["sentence"]).setOutputCol("token")
->>> bertEmbeddings = BertSentenceEmbeddings.pretrained("sent_biobert_pubmed_base_cased") \
-...  .setInputCols(["sentence"]) \
-...  .setOutputCol("embeddings")
->>> snomedTrainingPipeline = Pipeline(stages=[
-...  documentAssembler,
-...  sentenceDetector,
-...  bertEmbeddings,
-... ])
->>> snomedTrainingModel = snomedTrainingPipeline.fit(data)
->>> snomedData = snomedTrainingModel.transform(data).cache()
-
-
-
>>> bertExtractor = SentenceEntityResolverApproach() \
-...   .setNeighbours(25) \
-...   .setThreshold(1000) \
-...   .setInputCols(["bert_embeddings"]) \
-...   .setNormalizedCol("normalized_text") \
-...   .setLabelCol("label") \
-...   .setOutputCol("snomed_code") \
-...   .setDistanceFunction("EUCLIDIAN") \
-...   .setCaseSensitive(False)
-
-
-
>>> snomedModel = bertExtractor.fit(snomedData)
-
-
-
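The fitted resolver can then be applied to sentence embeddings like those produced above. A minimal, hypothetical inference sketch (assuming the resolver's input column matches the embeddings column of snomedData):

>>> # Apply the trained resolver to the embedded sentences and read out the codes.
>>> resolved = snomedModel.transform(snomedData)
>>> resolved.selectExpr("explode(snomed_code.result) as resolved_code").show(truncate=False)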

Methods


__init__()

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

fit(dataset[, params])

Fits a model to the input dataset with optional parameters.

fitMultiple(dataset, paramMaps)

Fits a model to the input dataset for each param map in paramMaps.

getCaseSensitive()

Gets whether to ignore case in tokens for embeddings matching.

getInputCols()

Gets current column names of input annotations.

getLazyAnnotator()

Gets whether Annotator should be evaluated lazily in a RecursivePipeline.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getOutputCol()

Gets output column name of annotations.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

set(param, value)

Sets a parameter in the embedded param map.

setAuxLabelCol(name)

Sets auxiliary label which maps resolved entities to additional labels

setCaseSensitive(value)

Sets whether to ignore case in tokens for embeddings matching.

setConfidenceFunction(s)

What function to use to calculate confidence: INVERSE or SOFTMAX.

setDistanceFunction(dist)

Sets distance function to use for WMD: 'EUCLIDEAN' or 'COSINE'.

setDropCodesList(value)

setExtractCosineDistances(name)

Extract Cosine Distances.

setInputCols(*value)

Sets column names of input annotations.

setLabelCol(name)

Sets column name for the value we are trying to resolve

setLazyAnnotator(value)

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

setMissAsEmpty(value)

Sets whether or not to return an empty annotation on unmatched chunks.

setNeighbours(k)

Sets number of neighbours to consider in the KNN query to calculate WMD.

setNormalizedCol(name)

Sets column name for the original, normalized description

setOutputCol(value)

Sets output column name of annotations.

setOverrideExistingCodes(value)

setParamValue(paramName)

Sets the value of a parameter.

setPretrainedModelPath(path)

setThreshold(thres)

Sets Threshold value for the last distance calculated.

setUseAuxLabel(name)

Sets Use AuxLabel Col or not.

write()

Returns an MLWriter instance for this ML instance.

-

Attributes


aux_label_col

caseSensitive

confidenceFunction

distanceFunction

dropCodesList

getter_attrs

inputCols

labelCol

lazyAnnotator

missAsEmpty

neighbours

normalizedCol

outputCol

overrideExistingCodes

params

Returns all params ordered by name.

pretrainedModelPath

returnCosineDistances

threshold

useAuxLabel

-
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some -extra params. This implementation first calls Params.copy and -then make a copy of the companion Java pipeline component with -extra params. So both the Python wrapper and the Java pipeline -component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
-
-fit(dataset, params=None)
-

Fits a model to the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset.

-
-
paramsdict or list or tuple, optional

an optional param map that overrides embedded params. If a list/tuple of -param maps is given, this calls fit on each param map and returns a list of -models.

-
-
-
-
Returns
-
-
Transformer or a list of Transformer

fitted model(s)

-
-
-
-
-
- -
-
-fitMultiple(dataset, paramMaps)
-

Fits a model to the input dataset for each param map in paramMaps.

-
-

New in version 2.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset.

-
-
paramMapscollections.abc.Sequence

A Sequence of param maps.

-
-
-
-
Returns
-
-
_FitMultipleIterator

A thread safe iterable which contains one model for each param map. Each -call to next(modelIterator) will return (index, model) where model was fit -using paramMaps[index]. index values may not be sequential.

-
-
-
-
-
- -
-
-getCaseSensitive()
-

Gets whether to ignore case in tokens for embeddings matching.

-
-
Returns
-
-
bool

Whether to ignore case in tokens for embeddings matching

-
-
-
-
-
- -
-
-getInputCols()
-

Gets current column names of input annotations.

-
- -
-
-getLazyAnnotator()
-

Gets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getOutputCol()
-

Gets output column name of annotations.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-property params
-

Returns all params ordered by name. The default implementation uses dir() to get all attributes of type Param.

-
- -
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setAuxLabelCol(name)[source]
-

Sets auxiliary label which maps resolved entities to additional labels

-
-
Parameters
-
-
namestr

Auxiliary label which maps resolved entities to additional labels

-
-
-
-
-
- -
-
-setCaseSensitive(value)
-

Sets whether to ignore case in tokens for embeddings matching.

-
-
Parameters
-
-
valuebool

Whether to ignore case in tokens for embeddings matching

-
-
-
-
-
- -
-
-setConfidenceFunction(s)
-

What function to use to calculate confidence: INVERSE or SOFTMAX.

-
-
Parameters
-
-
sstr

What function to use to calculate confidence: INVERSE or SOFTMAX.

-
-
-
-
-
- -
-
-setDistanceFunction(dist)
-

Sets distance function to use for WMD: ‘EUCLIDEAN’ or ‘COSINE’.

-
-
Parameters
-
-
diststr

Value that selects what distance function to use for WMD: ‘EUCLIDEAN’ or ‘COSINE’.

-
-
-
-
-
- -
-
-setExtractCosineDistances(name)[source]
-

Sets whether to extract cosine distances (True or False).

-
-
Parameters
-
-
namebool

Whether to extract cosine distances (True or False)

-
-
-
-
-
- -
-
-setInputCols(*value)
-

Sets column names of input annotations.

-
-
Parameters
-
-
*valuestr

Input columns for the annotator

-
-
-
-
-
- -
-
-setLabelCol(name)[source]
-

Sets column name for the value we are trying to resolve

-
-
Parameters
-
-
namestr

Column name for the value we are trying to resolve

-
-
-
-
-
- -
-
-setLazyAnnotator(value)
-

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

-
-
Parameters
-
-
valuebool

Whether Annotator should be evaluated lazily in a RecursivePipeline

-
-
-
-
-
- -
-
-setMissAsEmpty(value)
-

Sets whether or not to return an empty annotation on unmatched chunks.

-
-
Parameters
-
-
valuebool

whether or not to return an empty annotation on unmatched chunks.

-
-
-
-
-
- -
-
-setNeighbours(k)
-

Sets number of neighbours to consider in the KNN query to calculate WMD.

-
-
Parameters
-
-
kint

Number of neighbours to consider in the KNN query to calculate WMD.

-
-
-
-
-
- -
-
-setNormalizedCol(name)[source]
-

Sets column name for the original, normalized description

-
-
Parameters
-
-
namestr

Column name for the original, normalized description

-
-
-
-
-
- -
-
-setOutputCol(value)
-

Sets output column name of annotations.

-
-
Parameters
-
-
valuestr

Name of output column

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-setThreshold(thres)
-

Sets Threshold value for the last distance calculated.

-
-
Parameters
-
-
thresfloat

Threshold value for the last distance calculated.

-
-
-
-
-
- -
-
-setUseAuxLabel(name)[source]
-

Sets whether to use the auxiliary label column or not.

-
-
Parameters
-
-
namebool

Whether to use the auxiliary label column or not.

-
-
-
-
-
- -
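Putting the auxiliary-label setters above together, a hedged sketch; the column names and the training frame aux_data are assumptions, not taken from this page.

>>> resolverWithAux = SentenceEntityResolverApproach() \
...     .setInputCols(["sentence_embeddings"]) \
...     .setOutputCol("resolution") \
...     .setLabelCol("code") \
...     .setAuxLabelCol("aux_code") \
...     .setUseAuxLabel(True) \
...     .setExtractCosineDistances(True)
>>> resolverWithAuxModel = resolverWithAux.fit(aux_data)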
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

-
- -
- -
- - -
- - - - -
- - -
-
- - - -
-
- - - - - -
-
\ No newline at end of file
diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.SentenceEntityResolverModel.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.SentenceEntityResolverModel.html
deleted file mode 100644
index 0d07298c64..0000000000
--- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.SentenceEntityResolverModel.html
+++ /dev/null
@@ -1,1385 +0,0 @@
- sparknlp_jsl.annotator.SentenceEntityResolverModel — Spark NLP 3.3.0 documentation
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl.annotator.SentenceEntityResolverModel

-
-
-class sparknlp_jsl.annotator.SentenceEntityResolverModel(classname='com.johnsnowlabs.nlp.annotators.resolution.SentenceEntityResolverModel', java_model=None)[source]
-

Bases: sparknlp.common.AnnotatorModel, sparknlp.common.HasEmbeddingsProperties, sparknlp.common.HasStorageModel, sparknlp_jsl.annotator.SentenceResolverParams

-

This class contains all the parameters and methods to train a SentenceEntityResolverModel. The model transforms a dataset with the input annotation type SENTENCE_EMBEDDINGS, coming from e.g. [BertSentenceEmbeddings](/docs/en/transformers#bertsentenceembeddings), and returns the normalized entity for a particular trained ontology / curated dataset (e.g. ICD-10, RxNorm, SNOMED etc.).

Input Annotation types: SENTENCE_EMBEDDINGS

Output Annotation type: ENTITY

-
-
Parameters
-
-
returnCosineDistances

Whether to extract cosine distances (True or False)

-
-
aux_label_col

Auxiliary label which maps resolved entities to additional labels

-
-
useAuxLabel

Use AuxLabel Col or not

-
-
searchTree

Search tree for resolution

-
-
-
-
-

Examples

-
>>> import sparknlp
->>> from sparknlp.base import *
->>> from sparknlp.common import *
->>> from sparknlp.annotator import *
->>> from sparknlp.training import *
->>> import sparknlp_jsl
->>> from sparknlp_jsl.base import *
->>> from sparknlp_jsl.annotator import *
->>> from pyspark.ml import Pipeline
->>> documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
->>> sentenceDetector = SentenceDetector().setInputCols(["document"]).setOutputCol("sentence")
->>> tokenizer = Tokenizer().setInputCols(["sentence"]).setOutputCol("token")
->>> bertEmbeddings = BertSentenceEmbeddings.pretrained("sent_biobert_pubmed_base_cased") \
-...  .setInputCols(["sentence"]) \
-...  .setOutputCol("embeddings")
->>> snomedTrainingPipeline = Pipeline(stages=[
-...  documentAssembler,
-...  sentenceDetector,
-...  bertEmbeddings,
-... ])
->>> snomedTrainingModel = snomedTrainingPipeline.fit(data)
->>> snomedData = snomedTrainingModel.transform(data).cache()
-
-
-
>>> bertExtractor = SentenceEntityResolverApproach() \
-...   .setNeighbours(25) \
-...   .setThreshold(1000) \
-...   .setInputCols(["bert_embeddings"]) \
-...   .setNormalizedCol("normalized_text") \
-...   .setLabelCol("label") \
-...   .setOutputCol("snomed_code") \
-...   .setDistanceFunction("EUCLIDEAN") \
-...   .setCaseSensitive(False)
-
-
-
>>> snomedModel = bertExtractor.fit(snomedData)
-
-
-
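For completeness, a hedged inference sketch with an already trained or pretrained resolver; the model name "sbiobertresolve_icd10cm" and the column names are assumptions for illustration, not taken from this page.

>>> resolver = SentenceEntityResolverModel.pretrained("sbiobertresolve_icd10cm", "en", "clinical/models") \
...     .setInputCols(["sentence_embeddings"]) \
...     .setOutputCol("icd10cm_code") \
...     .setDistanceFunction("EUCLIDEAN")
>>> resolverPipeline = Pipeline(stages=[documentAssembler, sentenceDetector, bertEmbeddings, resolver])
>>> resolved = resolverPipeline.fit(data).transform(data)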

Methods

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

__init__([classname, java_model])

Initialize this instance with a Java model object.

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

getCaseSensitive()

Gets whether to ignore case in tokens for embeddings matching.

getDimension()

Gets embeddings dimension.

getIncludeStorage()

Gets whether to include indexed storage in trained model.

getInputCols()

Gets current column names of input annotations.

getLazyAnnotator()

Gets whether Annotator should be evaluated lazily in a RecursivePipeline.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getOutputCol()

Gets output column name of annotations.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

getStorageRef()

Gets unique reference name for identification.

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

loadStorage(path, spark, storage_ref)

loadStorages(path, spark, storage_ref, databases)

pretrained(name[, lang, remote_loc])

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

saveStorage(path, spark)

Saves the current model to storage.

set(param, value)

Sets a parameter in the embedded param map.

setAuxLabelCol(name)

Sets auxiliary label which maps resolved entities to additional labels

setCaseSensitive(value)

Sets whether to ignore case in tokens for embeddings matching.

setConfidenceFunction(s)

What function to use to calculate confidence: INVERSE or SOFTMAX.

setDimension(value)

Sets embeddings dimension.

setDistanceFunction(dist)

Sets distance function to use for WMD: 'EUCLIDEAN' or 'COSINE'.

setIncludeStorage(value)

Sets whether to include indexed storage in trained model.

setInputCols(*value)

Sets column names of input annotations.

setLazyAnnotator(value)

Sets whether Annotator should be evaluated lazily in a RecursivePipeline.

setMissAsEmpty(value)

Sets whether or not to return an empty annotation on unmatched chunks.

setNeighbours(k)

Sets number of neighbours to consider in the KNN query to calculate WMD.

setOutputCol(value)

Sets output column name of annotations.

setParamValue(paramName)

Sets the value of a parameter.

setParams()

setSearchTree(s)

Sets the search tree to be used for resolution.

setStorageRef(value)

Sets unique reference name for identification.

setThreshold(thres)

Sets Threshold value for the last distance calculated.

setUseAuxLabel(name)

Sets Use AuxLabel Col or not.

transform(dataset[, params])

Transforms the input dataset with optional parameters.

write()

Returns an MLWriter instance for this ML instance.

-

Attributes

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

aux_label_col

caseSensitive

confidenceFunction

dimension

distanceFunction

getter_attrs

includeStorage

inputCols

lazyAnnotator

missAsEmpty

name

neighbours

outputCol

params

Returns all params ordered by name.

returnCosineDistances

searchTree

storageRef

threshold

useAuxLabel

-
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some extra params. This implementation first calls Params.copy and then makes a copy of the companion Java pipeline component with extra params, so both the Python wrapper and the Java pipeline component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
-
-getCaseSensitive()
-

Gets whether to ignore case in tokens for embeddings matching.

-
-
Returns
-
-
bool

Whether to ignore case in tokens for embeddings matching

-
-
-
-
-
- -
-
-getDimension()
-

Gets embeddings dimension.

-
- -
-
-getIncludeStorage()
-

Gets whether to include indexed storage in trained model.

-
-
Returns
-
-
bool

Whether to include indexed storage in trained model

-
-
-
-
-
- -
-
-getInputCols()
-

Gets current column names of input annotations.

-
- -
-
-getLazyAnnotator()
-

Gets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getOutputCol()
-

Gets output column name of annotations.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-getStorageRef()
-

Gets unique reference name for identification.

-
-
Returns
-
-
str

Unique reference name for identification

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-property params
-

Returns all params ordered by name. The default implementation -uses dir() to get all attributes of type -Param.

-
- -
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
-
-saveStorage(path, spark)
-

Saves the current model to storage.

-
-
Parameters
-
-
pathstr

Path for saving the model.

-
-
sparkpyspark.sql.SparkSession

The current SparkSession

-
-
-
-
-
- -
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setAuxLabelCol(name)[source]
-

Sets auxiliary label which maps resolved entities to additional labels

-
-
Parameters
-
-
namestr

Auxiliary label which maps resolved entities to additional labels

-
-
-
-
-
- -
-
-setCaseSensitive(value)
-

Sets whether to ignore case in tokens for embeddings matching.

-
-
Parameters
-
-
valuebool

Whether to ignore case in tokens for embeddings matching

-
-
-
-
-
- -
-
-setConfidenceFunction(s)
-

What function to use to calculate confidence: INVERSE or SOFTMAX.

-
-
Parameters
-
-
sstr

What function to use to calculate confidence: INVERSE or SOFTMAX.

-
-
-
-
-
- -
-
-setDimension(value)
-

Sets embeddings dimension.

-
-
Parameters
-
-
valueint

Embeddings dimension

-
-
-
-
-
- -
-
-setDistanceFunction(dist)
-

Sets distance function to use for WMD: ‘EUCLIDEAN’ or ‘COSINE’.

-
-
Parameters
-
-
diststr

Value that selects what distance function to use for WMD: ‘EUCLIDEAN’ or ‘COSINE’.

-
-
-
-
-
- -
-
-setIncludeStorage(value)
-

Sets whether to include indexed storage in trained model.

-
-
Parameters
-
-
valuebool

Whether to include indexed storage in trained model

-
-
-
-
-
- -
-
-setInputCols(*value)
-

Sets column names of input annotations.

-
-
Parameters
-
-
*valuestr

Input columns for the annotator

-
-
-
-
-
- -
-
-setLazyAnnotator(value)
-

Sets whether Annotator should be evaluated lazily in a -RecursivePipeline.

-
-
Parameters
-
-
valuebool

Whether Annotator should be evaluated lazily in a -RecursivePipeline

-
-
-
-
-
- -
-
-setMissAsEmpty(value)
-

Sets whether or not to return an empty annotation on unmatched chunks.

-
-
Parameters
-
-
valuebool

whether or not to return an empty annotation on unmatched chunks.

-
-
-
-
-
- -
-
-setNeighbours(k)
-

Sets number of neighbours to consider in the KNN query to calculate WMD.

-
-
Parameters
-
-
kint

Number of neighbours to consider in the KNN query to calculate WMD.

-
-
-
-
-
- -
-
-setOutputCol(value)
-

Sets output column name of annotations.

-
-
Parameters
-
-
valuestr

Name of output column

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-setSearchTree(s)[source]
-

Sets the search tree to be used for resolution.

-
-
Parameters
-
-
sstr

Search tree to be used for resolution

-
-
-
-
-
- -
-
-setStorageRef(value)
-

Sets unique reference name for identification.

-
-
Parameters
-
-
valuestr

Unique reference name for identification

-
-
-
-
-
- -
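A small illustrative check of the storage reference described above, assuming the resolver and the bertEmbeddings stage from the example exist: the sentence-embeddings stage feeding the resolver should carry the same storage reference the resolver was trained with.

>>> resolver.getStorageRef()          # reference stored with the trained resolver
>>> bertEmbeddings.getStorageRef()    # should match the embeddings stage feeding it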
-
-setThreshold(thres)
-

Sets Threshold value for the last distance calculated.

-
-
Parameters
-
-
thresfloat

Threshold value for the last distance calculated.

-
-
-
-
-
- -
-
-setUseAuxLabel(name)[source]
-

Sets Use AuxLabel Col or not.

-
-
Parameters
-
-
namebool

Use AuxLabel Col or not.

-
-
-
-
-
- -
-
-transform(dataset, params=None)
-

Transforms the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset

-
-
paramsdict, optional

an optional param map that overrides embedded params.

-
-
-
-
Returns
-
-
pyspark.sql.DataFrame

transformed dataset

-
-
-
-
-
- -
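A minimal sketch of transform on an already fitted pipeline; pipelineModel, data and the output column name are assumptions for illustration.

>>> resolved = pipelineModel.transform(data)
>>> resolved.selectExpr("explode(icd10cm_code) as code").show(truncate=False)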
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

-
- -
- -
- - -
- - - - -
- - -
-
- - - -
-
- - - - - -
-
\ No newline at end of file
diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.SentenceResolverParams.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.SentenceResolverParams.html
deleted file mode 100644
index e3548614f5..0000000000
--- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.SentenceResolverParams.html
+++ /dev/null
@@ -1,769 +0,0 @@
- sparknlp_jsl.annotator.SentenceResolverParams — Spark NLP 3.3.0 documentation
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl.annotator.SentenceResolverParams

-
-
-class sparknlp_jsl.annotator.SentenceResolverParams[source]
-

Bases: sparknlp.common.HasCaseSensitiveProperties

-

Class used to provide a common interface for the Sentence Resolver family.

-
-
Parameters
-
-
distanceFunction

What distance function to use for WMD: ‘EUCLIDEAN’ or ‘COSINE’.

-
-
neighbours

Number of neighbours to consider in the KNN query to calculate WMD

-
-
threshold

Threshold value for the last distance calculated.

-
-
confidenceFunction

What function to use to calculate confidence: INVERSE or SOFTMAX.

-
-
missAsEmpty

Whether or not to return an empty annotation on unmatched chunks.

-
-
-
-
-
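Since these parameters are shared by the whole Sentence Resolver family, they can be set the same way on any resolver annotator; a hedged sketch, with illustrative values.

>>> resolver = SentenceEntityResolverApproach() \
...     .setNeighbours(500) \
...     .setThreshold(5.0) \
...     .setDistanceFunction("EUCLIDEAN") \
...     .setConfidenceFunction("INVERSE") \
...     .setMissAsEmpty(True) \
...     .setCaseSensitive(False)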

Methods

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - -

__init__(*args, **kwargs)

getCaseSensitive()

Gets whether to ignore case in tokens for embeddings matching.

setCaseSensitive(value)

Sets whether to ignore case in tokens for embeddings matching.

setConfidenceFunction(s)

What function to use to calculate confidence: INVERSE or SOFTMAX.

setDistanceFunction(dist)

Sets distance function to use for WMD: 'EUCLIDEAN' or 'COSINE'.

setMissAsEmpty(value)

Sets whether or not to return an empty annotation on unmatched chunks.

setNeighbours(k)

Sets number of neighbours to consider in the KNN query to calculate WMD.

setThreshold(thres)

Sets Threshold value for the last distance calculated.

-

Attributes

- ---- - - - - - - - - - - - - - - - - - - - - -

caseSensitive

confidenceFunction

distanceFunction

missAsEmpty

neighbours

threshold

-
-
-getCaseSensitive()
-

Gets whether to ignore case in tokens for embeddings matching.

-
-
Returns
-
-
bool

Whether to ignore case in tokens for embeddings matching

-
-
-
-
-
- -
-
-setCaseSensitive(value)
-

Sets whether to ignore case in tokens for embeddings matching.

-
-
Parameters
-
-
valuebool

Whether to ignore case in tokens for embeddings matching

-
-
-
-
-
- -
-
-setConfidenceFunction(s)[source]
-

What function to use to calculate confidence: INVERSE or SOFTMAX.

-
-
Parameters
-
-
sstr

What function to use to calculate confidence: INVERSE or SOFTMAX.

-
-
-
-
-
- -
-
-setDistanceFunction(dist)[source]
-

Sets distance function to use for WMD: ‘EUCLIDEAN’ or ‘COSINE’.

-
-
Parameters
-
-
diststr

Value that selects what distance function to use for WMD: ‘EUCLIDEAN’ or ‘COSINE’.

-
-
-
-
-
- -
-
-setMissAsEmpty(value)[source]
-

Sets whether or not to return an empty annotation on unmatched chunks.

-
-
Parameters
-
-
valuebool

whether or not to return an empty annotation on unmatched chunks.

-
-
-
-
-
- -
-
-setNeighbours(k)[source]
-

Sets number of neighbours to consider in the KNN query to calculate WMD.

-
-
Parameters
-
-
kint

Number of neighbours to consider in the KNN query to calculate WMD.

-
-
-
-
-
- -
-
-setThreshold(thres)[source]
-

Sets Threshold value for the last distance calculated.

-
-
Parameters
-
-
thresfloat

Threshold value for the last distance calculated.

-
-
-
-
-
- -
- -
- - -
- - - - -
- - -
-
- - - -
-
- - - - - -
-
\ No newline at end of file
diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.html
deleted file mode 100644
index b4342eaabd..0000000000
--- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.annotator.html
+++ /dev/null
@@ -1,746 +0,0 @@
- sparknlp_jsl.annotator — Spark NLP 3.3.0 documentation
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl.annotator

-

Classes

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

AnnotationMerger

Merges Annotations from multiple columns.

AssertionDLApproach

Trains an Assertion Model using deep learning.

AssertionDLModel

AssertionDL is a deep learning based approach used to extract Assertion Status from extracted entities and text.

AssertionFilterer

Filters entities coming from ASSERTION type annotations and returns the CHUNKS.

AssertionLogRegApproach

Trains an Assertion model using a logistic regression model.

AssertionLogRegModel

This is the main class in the AssertionLogReg family. Logistic regression is used to extract the Assertion Status.

AverageEmbeddings

BertSentenceChunkEmbeddings

BERT Sentence embeddings for chunk annotations which take into account the context of the sentence the chunk appeared in.

Chunk2Token

ChunkConverter

Converts chunks from RegexMatcher to chunks with an entity in the metadata.

ChunkFilterer

Model that filters entities coming from CHUNK annotations. Filters can be set via a white list of terms or a regular expression.

ChunkFiltererApproach

Model that filters entities coming from CHUNK annotations. Filters can be set via a white list of terms or a regular expression.

ChunkKeyPhraseExtraction

Chunk KeyPhrase Extraction uses Bert Sentence Embeddings to determine the most relevant key phrases describing a text.

ChunkMergeApproach

Merges two chunk columns coming from two annotators (NER, ContextualParser or any other annotator producing chunks).

ChunkMergeModel

The model produced by ChunkMergeApproach.

ChunkSentenceSplitter

Splits the document using the provided chunks and puts the chunk entity in the metadata.

CommonResolverParams

Class used to provide a common interface for the Entity Resolver family.

ContextualParserApproach

Creates a model that extracts entities from a document based on user-defined rules.

ContextualParserModel

Extracts entities from a document based on user-defined rules.

DateNormalizer

Tries to normalize dates in chunk annotations.

DeIdentification

Contains all the methods for training a DeIdentificationModel model.

DeIdentificationModel

The DeIdentificationModel model can obfuscate or mask the entities that contain personal information.

DocumentLogRegClassifierApproach

Trains a model to classify documents with a Logistic Regression algorithm.

DocumentLogRegClassifierModel

Classifies documents with a Logistic Regression algorithm.

DrugNormalizer

Annotator which normalizes raw text from clinical documents, e.g. scraped web pages or xml documents, from document type columns into Sentence.

EntityChunkEmbeddings

Weighted average embeddings of multiple named entity chunk annotations.

GenericClassifierApproach

Trains a TensorFlow model for generic classification of feature vectors.

GenericClassifierModel

Generic classifier of feature vectors.

IOBTagger

Merges token tags and NER labels from chunks in the specified format.

MedicalBertForSequenceClassification

MedicalBertForSequenceClassification can load Bert Models with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g.

MedicalBertForTokenClassifier

MedicalBertForTokenClassifier can load Bert Models with a token classification head on top (a linear layer on top of the hidden-states output) e.g.

MedicalDistilBertForSequenceClassification

MedicalDistilBertForSequenceClassification can load DistilBERT Models with sequence classification/regression head on top (a linear layer on top of the pooled output) e.g.

MedicalNerApproach

This Named Entity recognition annotator allows training a generic NER model based on Neural Networks.

MedicalNerModel

This Named Entity recognition annotator is a generic NER model based on Neural Networks.

NerChunker

Extracts phrases that fit into a known pattern using the NER tags. Useful for entity groups with neighboring tokens.

NerConverterInternal

Converts an IOB or IOB2 representation of NER to a user-friendly one, by associating the tokens of recognized entities and their label.

NerDisambiguator

Links words of interest, such as names of persons, locations and companies, from an input text document to a corresponding unique entity in a target Knowledge Base (KB).

NerDisambiguatorModel

Links words of interest, such as names of persons, locations and companies, from an input text document to a corresponding unique entity in a target Knowledge Base (KB).

PosologyREModel

RENerChunksFilter

Filters and outputs combinations of relations between extracted entities, for further processing.

ReIdentification

RelationExtractionApproach

Trains a TensorFlow model for relation extraction.

RelationExtractionDLModel

Extracts and classifies instances of relations between named entities.

RelationExtractionModel

Trains a TensorFlow model for relation extraction.

Router

Converts chunks from RegexMatcher to chunks with an entity in the metadata.

SentenceEntityResolverApproach

This class contains all the parameters and methods to train a SentenceEntityResolverModel.

SentenceEntityResolverModel

This class contains all the parameters and methods to train a SentenceEntityResolverModel.

SentenceResolverParams

Class used to provide a common interface for the Sentence Resolver family.

-
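An illustrative import pattern for the classes listed above; any subset can be imported directly from the module.

>>> from sparknlp_jsl.annotator import MedicalNerModel, NerConverterInternal, ChunkMergeApproach, SentenceEntityResolverModel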
- - -
- - - - -
- - -
-
- - - -
-
- - - - - -
-
\ No newline at end of file
diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.base.FeaturesAssembler.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.base.FeaturesAssembler.html
deleted file mode 100644
index fef3e80bdf..0000000000
--- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.base.FeaturesAssembler.html
+++ /dev/null
@@ -1,918 +0,0 @@
- sparknlp_jsl.base.FeaturesAssembler — Spark NLP 3.3.0 documentation
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl.base.FeaturesAssembler

-
-
-class sparknlp_jsl.base.FeaturesAssembler[source]
-

Bases: sparknlp.internal.AnnotatorTransformer

-

Methods

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

__init__()

clear(param)

Clears a param from the param map if it has been explicitly set.

copy([extra])

Creates a copy of this instance with the same uid and some extra params.

explainParam(param)

Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.

explainParams()

Returns the documentation of all params with their optionally default values and user-supplied values.

extractParamMap([extra])

Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra.

getOrDefault(param)

Gets the value of a param in the user-supplied param map or its default value.

getParam(paramName)

Gets a param by its name.

getParamValue(paramName)

Gets the value of a parameter.

hasDefault(param)

Checks whether a param has a default value.

hasParam(paramName)

Tests whether this instance contains a param with a given (string) name.

isDefined(param)

Checks whether a param is explicitly set by user or has a default value.

isSet(param)

Checks whether a param is explicitly set by user.

load(path)

Reads an ML instance from the input path, a shortcut of read().load(path).

read()

Returns an MLReader instance for this class.

save(path)

Save this ML instance to the given path, a shortcut of 'write().save(path)'.

set(param, value)

Sets a parameter in the embedded param map.

setInputCols(value)

Sets input columns name.

setOutputCol(value)

Sets output column name.

setParamValue(paramName)

Sets the value of a parameter.

setParams()

transform(dataset[, params])

Transforms the input dataset with optional parameters.

write()

Returns an MLWriter instance for this ML instance.

-

Attributes

- ---- - - - - - - - - - - - - - - - - - -

getter_attrs

inputCols

name

outputCol

params

Returns all params ordered by name.

-
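A hedged usage sketch: FeaturesAssembler collects numeric feature columns into a single feature-vector annotation column that downstream annotators such as GenericClassifierApproach can consume. The column names and the input frame data are assumptions, not taken from this page.

>>> from sparknlp_jsl.base import FeaturesAssembler
>>> featuresAssembler = FeaturesAssembler() \
...     .setInputCols(["feature_1", "feature_2", "feature_3"]) \
...     .setOutputCol("features")
>>> assembled = featuresAssembler.transform(data)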
-
-clear(param)
-

Clears a param from the param map if it has been explicitly set.

-
- -
-
-copy(extra=None)
-

Creates a copy of this instance with the same uid and some extra params. This implementation first calls Params.copy and then makes a copy of the companion Java pipeline component with extra params, so both the Python wrapper and the Java pipeline component get copied.

-
-
Parameters
-
-
extradict, optional

Extra parameters to copy to the new instance

-
-
-
-
Returns
-
-
JavaParams

Copy of this instance

-
-
-
-
-
- -
-
-explainParam(param)
-

Explains a single param and returns its name, doc, and optional -default value and user-supplied value in a string.

-
- -
-
-explainParams()
-

Returns the documentation of all params with their optionally -default values and user-supplied values.

-
- -
-
-extractParamMap(extra=None)
-

Extracts the embedded default param values and user-supplied -values, and then merges them with extra values from input into -a flat param map, where the latter value is used if there exist -conflicts, i.e., with ordering: default param values < -user-supplied values < extra.

-
-
Parameters
-
-
extradict, optional

extra param values

-
-
-
-
Returns
-
-
dict

merged param map

-
-
-
-
-
- -
-
-getOrDefault(param)
-

Gets the value of a param in the user-supplied param map or its -default value. Raises an error if neither is set.

-
- -
-
-getParam(paramName)
-

Gets a param by its name.

-
- -
-
-getParamValue(paramName)
-

Gets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-hasDefault(param)
-

Checks whether a param has a default value.

-
- -
-
-hasParam(paramName)
-

Tests whether this instance contains a param with a given -(string) name.

-
- -
-
-isDefined(param)
-

Checks whether a param is explicitly set by user or has -a default value.

-
- -
-
-isSet(param)
-

Checks whether a param is explicitly set by user.

-
- -
-
-classmethod load(path)
-

Reads an ML instance from the input path, a shortcut of read().load(path).

-
- -
-
-property params
-

Returns all params ordered by name. The default implementation -uses dir() to get all attributes of type -Param.

-
- -
-
-classmethod read()
-

Returns an MLReader instance for this class.

-
- -
-
-save(path)
-

Save this ML instance to the given path, a shortcut of ‘write().save(path)’.

-
- -
-
-set(param, value)
-

Sets a parameter in the embedded param map.

-
- -
-
-setInputCols(value)[source]
-

Sets input columns name.

-
-
Parameters
-
-
valuestr

Name of the input column

-
-
-
-
-
- -
-
-setOutputCol(value)[source]
-

Sets output column name.

-
-
Parameters
-
-
valuestr

Name of the Output Column

-
-
-
-
-
- -
-
-setParamValue(paramName)
-

Sets the value of a parameter.

-
-
Parameters
-
-
paramNamestr

Name of the parameter

-
-
-
-
-
- -
-
-transform(dataset, params=None)
-

Transforms the input dataset with optional parameters.

-
-

New in version 1.3.0.

-
-
-
Parameters
-
-
datasetpyspark.sql.DataFrame

input dataset

-
-
paramsdict, optional

an optional param map that overrides embedded params.

-
-
-
-
Returns
-
-
pyspark.sql.DataFrame

transformed dataset

-
-
-
-
-
- -
-
-uid
-

A unique id for the object.

-
- -
-
-write()
-

Returns an MLWriter instance for this ML instance.

-
- -
- -
- - -
- - - - -
- - -
-
- - - -
-
- - - - - -
-
\ No newline at end of file
diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.base.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.base.html
deleted file mode 100644
index 94f9481c8d..0000000000
--- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.base.html
+++ /dev/null
@@ -1,590 +0,0 @@
- sparknlp_jsl.base — Spark NLP 3.3.0 documentation
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl.base

-

Classes

- ---- - - - - - -

FeaturesAssembler

-
- - -
- - - - -
- - -
-
- - - -
-
- - - - - -
-
\ No newline at end of file
diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.training.AnnotationToolJsonReader.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.training.AnnotationToolJsonReader.html
deleted file mode 100644
index 93fdf4d099..0000000000
--- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.training.AnnotationToolJsonReader.html
+++ /dev/null
@@ -1,642 +0,0 @@
- sparknlp_jsl.training.AnnotationToolJsonReader — Spark NLP 3.3.0 documentation
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl.training.AnnotationToolJsonReader

-
-
-class sparknlp_jsl.training.AnnotationToolJsonReader(pipeline_model=None, assertion_labels=None, excluded_labels=None, cleanup_mode='disabled', split_chars=None, context_chars=None, scheme='IOB', min_chars_tol=2, align_chars_tol=1, merge_overlapping=True, SDDLPath='')[source]
-

Bases: sparknlp.internal.ExtendedJavaWrapper

-

A reader that generates an assertion training set from the JSON exports of the annotation labs.

-

Examples

-
>>> from sparknlp_jsl.training import AnnotationToolJsonReader
->>> assertion_labels = ["AsPresent","Absent"]
->>> excluded_labels = ["Treatment"]
->>> split_chars = [" ", "\-"]
->>> context_chars = [".", ",", ";"]
->>> SDDLPath = ""
->>> rdr = AnnotationToolJsonReader(assertion_labels = assertion_labels, excluded_labels = excluded_labels, split_chars = split_chars, context_chars = context_chars,SDDLPath=SDDLPath)
->>> path = "src/test/resources/anc-pos-corpus-small/test-training.txt"
->>> df = rdr.readDataset(spark, path)
->>> assertion_df = rdr.generateAssertionTrainSet(df)
->>> assertion_df.show()
-
-
-

Methods

- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - -

__init__([pipeline_model, assertion_labels, ...])

-
Attributes
-

-
-

apply()

generateAssertionTrainSet(df[, sentenceCol, ...])

new_java_array(pylist, java_class)

ToDo: Inspired from spark 2.0.

new_java_array_integer(pylist)

new_java_array_string(pylist)

new_java_obj(java_class, *args)

readDataset(spark, path)

-
-
-new_java_array(pylist, java_class)
-

ToDo: Inspired from spark 2.0. Review if spark changes

-
- -
- -
- - -
- - - - -
- - -
-
- - - -
-
- - - - - -
-
\ No newline at end of file
diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.training.CantemistReader.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.training.CantemistReader.html
deleted file mode 100644
index c99d60640a..0000000000
--- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.training.CantemistReader.html
+++ /dev/null
@@ -1,620 +0,0 @@
- sparknlp_jsl.training.CantemistReader — Spark NLP 3.3.0 documentation
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl.training.CantemistReader

-
-
-class sparknlp_jsl.training.CantemistReader(scheme='IOB')[source]
-

Bases: sparknlp.internal.ExtendedJavaWrapper

-

Methods

- ---- - - - - - - - - - - - - - - - - - - - - - - - -

__init__([scheme])

apply()

new_java_array(pylist, java_class)

ToDo: Inspired from spark 2.0.

new_java_array_integer(pylist)

new_java_array_string(pylist)

new_java_obj(java_class, *args)

readDatasetTaskNer(spark, textFolder)

-
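A hedged usage sketch of the readDatasetTaskNer method listed above; the folder path is illustrative.

>>> from sparknlp_jsl.training import CantemistReader
>>> reader = CantemistReader(scheme="IOB")
>>> cantemist_df = reader.readDatasetTaskNer(spark, "cantemist/train-set/cantemist-ner/")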
-
-new_java_array(pylist, java_class)
-

ToDo: Inspired from spark 2.0. Review if spark changes

-
- -
- -
- - -
- - - - -
- - -
-
- - - -
-
- - - - - -
-
\ No newline at end of file
diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.training.CodiEspReader.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.training.CodiEspReader.html
deleted file mode 100644
index 5b47257d0c..0000000000
--- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.training.CodiEspReader.html
+++ /dev/null
@@ -1,620 +0,0 @@
- sparknlp_jsl.training.CodiEspReader — Spark NLP 3.3.0 documentation
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl.training.CodiEspReader

-
-
-class sparknlp_jsl.training.CodiEspReader(scheme='IOB')[source]
-

Bases: sparknlp.internal.ExtendedJavaWrapper

-

Methods

- ---- - - - - - - - - - - - - - - - - - - - - - - - -

__init__([scheme])

apply()

new_java_array(pylist, java_class)

ToDo: Inspired from spark 2.0.

new_java_array_integer(pylist)

new_java_array_string(pylist)

new_java_obj(java_class, *args)

readDatasetTaskX(spark, path, textFolder[, sep])

-
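A hedged usage sketch of the readDatasetTaskX method listed above; the paths and separator are illustrative.

>>> from sparknlp_jsl.training import CodiEspReader
>>> reader = CodiEspReader(scheme="IOB")
>>> codiesp_df = reader.readDatasetTaskX(spark, "codiesp/train/trainX.tsv", "codiesp/train/text_files/", sep="\t")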
-
-new_java_array(pylist, java_class)
-

ToDo: Inspired from spark 2.0. Review if spark changes

-
- -
- -
- - -
- - - - -
- - -
-
- - - -
-
- - - - - -
-
\ No newline at end of file
diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.training.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.training.html
deleted file mode 100644
index 3ecded3df9..0000000000
--- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.training.html
+++ /dev/null
@@ -1,596 +0,0 @@
- sparknlp_jsl.training — Spark NLP 3.3.0 documentation
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl.training

-

Classes

- ---- - - - - - - - - - - - -

AnnotationToolJsonReader

A reader that generates an assertion training set from the JSON exports of the annotation labs.

CantemistReader

CodiEspReader

-
- - -
- - - - -
- - -
-
- - - -
-
- - - - - -
-
\ No newline at end of file
diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.training.tf_graph.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.training.tf_graph.html
deleted file mode 100644
index 1eb7d57da9..0000000000
--- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.training.tf_graph.html
+++ /dev/null
@@ -1,188 +0,0 @@
- sparknlp_jsl.training.tf_graph — Spark NLP 3.3.0 documentation
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl.training.tf_graph

-

Factory class to create the different TensorFlow graphs for the ner_dl, generic_classifier, assertion_dl and relation_extraction annotators in Spark NLP for Healthcare.

-
- - -
- - -
- - -
- -
- - -
-
- - - -
-
- - - - - -
-
\ No newline at end of file
diff --git a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.training.tf_graph_1x.html b/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.training.tf_graph_1x.html
deleted file mode 100644
index 59ac48a769..0000000000
--- a/docs/licensed/api/python/reference/autosummary/_autosummary/sparknlp_jsl.training.tf_graph_1x.html
+++ /dev/null
@@ -1,188 +0,0 @@
- sparknlp_jsl.training.tf_graph_1x — Spark NLP 3.3.0 documentation
-
- - - -
-
- - - - -
- - -
- - - -
- -
- -
- - -
- - - - - - -
- -
- -
-

sparknlp_jsl.training.tf_graph_1x

-

Factory class to create the different TensorFlow graphs for the ner_dl, generic_classifier, assertion_dl and relation_extraction annotators in Spark NLP for Healthcare.

-
- - -
- - -
- - -
- -
- - -
-
\ No newline at end of file