diff --git a/src/moldybread.nim b/src/moldybread.nim
index c942691..cbaac97 100644
--- a/src/moldybread.nim
+++ b/src/moldybread.nim
@@ -239,6 +239,17 @@ when isMainModule:
   ##
   ##    moldybread -o validate_checksums -n test -y /full/path/to/my/yaml/file.yml
   ##
+  ## Find Unique Datastreams
+  ## =======================
+  ##
+  ## You can get a list of unique datastreams from objects across a result set.
+  ##
+  ## Example command:
+  ##
+  ## .. code-block:: sh
+  ##
+  ##    moldybread -o find_distinct_datastreams -n test -y /full/path/to/my/yaml/file.yml
+  ##
   const banner = """
  __  __       _     _         ____                     _
 |  \/  | ___ | | __| |_   _  | __ ) _ __ ___  __ _  __| |
@@ -250,7 +261,7 @@ when isMainModule:
   """
   var p = newParser(fmt"Moldy Bread: See https://markpbaggett.github.io/moldybread/moldybread.html for documentation and examples on how to use this package.{'\n'}{'\n'}"):
     help(banner)
-    option("-o", "--operation", help="Specify operation", choices = @["harvest_datastream", "harvest_datastream_no_pages", "update_metadata", "update_metadata_and_delete_old_versions", "download_foxml", "version_datastream", "change_object_state", "purge_old_versions", "find_objs_missing_dsid", "get_datastream_history", "get_datastream_at_date", "validate_checksums"])
+    option("-o", "--operation", help="Specify operation", choices = @["harvest_datastream", "harvest_datastream_no_pages", "update_metadata", "update_metadata_and_delete_old_versions", "download_foxml", "version_datastream", "change_object_state", "purge_old_versions", "find_objs_missing_dsid", "get_datastream_history", "get_datastream_at_date", "validate_checksums", "find_distinct_datastreams"])
     option("-d", "--dsid", help="Specify datastream id.", default="")
     option("-n", "--namespaceorpid", help="Populate results based on namespace or PID.", default="")
     option("-dc", "--dcsearch", help="Populate results based on dc field and strings. See docs for formatting info.", default="")
@@ -381,6 +392,13 @@ when isMainModule:
         echo fmt"{'\n'}{len(test.successes)} objects had valid checksums for their {opts.dsid} datastream. {len(test.errors)} objects had invalid checksums on their {opts.dsid} datastream."
         if len(test.errors) > 0:
           echo test.errors
+    of "find_distinct_datastreams":
+      if opts.namespaceorpid == "" and opts.dcsearch == "" and opts.terms == "":
+        echo "Must specify how you want to populate results: -n for Pid or Namespace, -dc for dc fields and strings, or -t for keyword terms."
+      else:
+        fedora_connection.results = fedora_connection.populate_results()
+        let distinct_datastreams = fedora_connection.find_distinct_datastreams()
+        echo fmt"{'\n'}{'\n'}There are {len(distinct_datastreams)} unique datastreams across this result set: {'\n'}{distinct_datastreams}"
     of "update_metadata":
       if opts.path != "":
         yaml_settings.directory_path = opts.path
diff --git a/src/moldybreadpkg/fedora.nim b/src/moldybreadpkg/fedora.nim
index be3be81..c0ec0b8 100644
--- a/src/moldybreadpkg/fedora.nim
+++ b/src/moldybreadpkg/fedora.nim
@@ -709,3 +709,29 @@ method validate_checksums*(this: FedoraRequest): Message {. base .} =
     bar.increment()
   bar.finish()
   Message(errors: errors, successes: successes, attempts: attempts)
+
+method find_distinct_datastreams*(this: FedoraRequest): seq[string] {. base .} =
+  ## Filters distinct datastreams from all objects in a result set.
+  ##
+  ## Walks the report returned by `get_datastreams` and returns each datastream
+  ## once, in the order in which it is first encountered.
+  ##
+  ## Example:
+  ##
+  ## .. code-block:: nim
+  ##
+  ##    let fedora_connection = initFedoraRequest(output_directory="/home/mark/nim_projects/moldybread/experiment", pid_part="test")
+  ##    fedora_connection.results = fedora_connection.populate_results()
+  ##    echo fedora_connection.find_distinct_datastreams()
+  ##
+  var bar = newProgressBar()
+  let datastream_report = this.get_datastreams()
+  echo "\n\nFiltering unique datastreams from result set.\n"
+  bar.start()
+  for entry in datastream_report:
+    # Element 1 of each report entry is the collection of datastreams; element 0 is not needed here.
+    for datastream in entry[1]:
+      if datastream notin result:
+        result.add(datastream)
+    bar.increment()
+  bar.finish()