Merge pull request #67 from hynky1999/docs
docs
hynky1999 authored May 12, 2023
2 parents 63d4cce + 5a066a3 commit 369eed6
Showing 588 changed files with 118,832 additions and 0 deletions.
Binary file added docs/build/doctrees/api.doctree
Binary file added docs/build/doctrees/cli/cli.doctree
Binary file added docs/build/doctrees/cli/download.doctree
Binary file added docs/build/doctrees/cli/extract.doctree
Binary file added docs/build/doctrees/cli/index.doctree
Binary file added docs/build/doctrees/environment.pickle
Binary file added docs/build/doctrees/extraction/index.doctree
Binary file added docs/build/doctrees/extraction/utils.doctree
Binary file added docs/build/doctrees/generated/cmoncrawl.doctree
Binary file added docs/build/doctrees/index.doctree
Binary file added docs/build/doctrees/misc/domain_record.doctree
Binary file added docs/build/doctrees/misc/index.doctree
Binary file added docs/build/doctrees/prog_guide/index.doctree
Binary file added docs/build/doctrees/prog_guide/overview.doctree
Binary file added docs/build/doctrees/prog_guide/pip.doctree
Binary file added docs/build/doctrees/usage.doctree
4 changes: 4 additions & 0 deletions docs/build/html/.buildinfo
@@ -0,0 +1,4 @@
# Sphinx build info version 1
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
config: aa56b11fc3400b25742e9c52f456c98e
tags: 645f666f9bcd5a90fca523b33c5a78b7
14 changes: 14 additions & 0 deletions docs/build/html/_sources/api.rst.txt
@@ -0,0 +1,14 @@
API
===

.. autosummary::
   :recursive:
   :toctree: generated

   cmoncrawl





41 changes: 41 additions & 0 deletions docs/build/html/_sources/cli/cli.rst.txt
@@ -0,0 +1,41 @@
.. _cli:

Command Line Interface
======================

The command line interface is a simple wrapper around the library.

It provides two main functionalities:

* ``download`` - Downloads samples of either :ref:`domain_record` or HTML from Common Crawl indexes.
* ``extract`` - Downloads the HTML for a Domain Record and extracts its content. It can also take HTML directly and extract the data.

Both functionalities are invoked using ``cmon`` followed by the subcommand and its required arguments.

Examples
--------

.. code-block:: bash

   # Download the first 1000 domain records for example.com
   cmon download --match_type=domain --limit=1000 example.com dr_output record
   # Download the first 100 HTML files for example.com
   cmon download --match_type=domain --limit=100 example.com html_output html
   # Take the domain records downloaded with the first command and extract them using your extractors
   cmon extract config.json extracted_output dr_output/*/*.jsonl record
   # Take the HTML files downloaded with the second command and extract them using your extractors
   cmon extract config.json extracted_output html_output/*/*.html html
74 changes: 74 additions & 0 deletions docs/build/html/_sources/cli/download.rst.txt
@@ -0,0 +1,74 @@
Command Line Download
=====================

The download mode of the ``cmon`` command line tool serves to query and download from Common Crawl indexes.
The following arguments are needed, in this order:

Positional arguments
--------------------

1. url - URL to query.

2. output - Path to output directory.

3. {record,html} - Download mode:

- record: Download record files from Common Crawl.
- html: Download HTML files from Common Crawl.


In html mode, the output directory will contain ``.html`` files, one
for each URL found. In record mode, the output directory will contain
``.jsonl`` files, each containing multiple domain records in JSON format.


Options
-------

--limit LIMIT
Max number of URLs to download.

--since SINCE
Start date in ISO format (e.g., 2020-01-01).

--to TO
End date in ISO format (e.g., 2020-01-01).

--cc_server CC_SERVER
Common Crawl indexes to query. Must provide the whole URL (e.g., https://index.commoncrawl.org/CC-MAIN-2023-14-index).

--max_retry MAX_RETRY
Max number of retries for a request. Increase this number when requests are failing.

--sleep_step SLEEP_STEP
Number of additional seconds to add to the sleep time between each failed download attempt. Increase this number if the server tells you to slow down.

--match_type MATCH_TYPE
One of exact, prefix, host, domain
Match type for the URL. Refer to cdx-api for more information.

--max_directory_size MAX_DIRECTORY_SIZE
Max number of files per directory.

--filter_non_200
Filter out non-200 status code.

Record mode options
-------------------

--max_crawls_per_file MAX_CRAWLS_PER_FILE
Max number of domain records per output file.



Examples
--------


.. code-block:: bash

   # Download the first 1000 domain records for example.com
   cmon download --match_type=domain --limit=1000 example.com dr_output record
   # Download the first 100 HTML files for example.com
   cmon download --match_type=domain --limit=100 example.com html_output html
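
The query options compose. As a sketch (the values are illustrative, not recommendations),
the first example can be restricted to pages crawled during 2021, keeping only successful
responses and retrying failed requests:

.. code-block:: bash

   # Illustrative combination of the options documented above
   cmon download --match_type=domain --limit=500 \
       --since=2021-01-01 --to=2021-12-31 \
       --filter_non_200 --max_retry=5 --sleep_step=10 \
       example.com dr_output record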
84 changes: 84 additions & 0 deletions docs/build/html/_sources/cli/extract.rst.txt
@@ -0,0 +1,84 @@
Command line Extract
====================

The extract mode of the ``cmon`` command line tool serves to extract data from your downloaded files.
The following arguments are needed, in this order:

Positional arguments
--------------------


1. config_path - Path to config file containing extraction rules.

2. output_path - Path to output directory.

3. files - Files to extract data from.

4. {record,html} - Extraction mode:

- record: Extract data from jsonl (domain record) files.
- html: Extract data from HTML files.

To create a config file, see :ref:`extractor_config`.

Both modes yield the same output format: ``.jsonl`` files containing the extracted data,
one record per line. For each input file a new directory is created in the output directory,
named after the file.

The files created by the download mode can be used directly with the matching extraction mode.
If you have an HTML file, use the html mode to extract it. If you have domain records obtained
some other way (e.g., AWS Athena), please refer to :ref:`domain_record_jsonl`, which describes
how to create ``.jsonl`` files from your domain records for use with the record mode.
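
For illustration, a record-mode run over two input files might leave a layout like the
following (all names here are hypothetical; only the one-directory-per-input-file
structure described above is guaranteed):

.. code-block:: text

   extracted_output/
   ├── dr_file1/
   │   └── 0_extracted.jsonl
   └── dr_file2/
       └── 0_extracted.jsonl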





Optional arguments
------------------

--max_crawls_per_file MAX_CRAWLS_PER_FILE
Max number of extractions per output file.

--max_directory_size MAX_DIRECTORY_SIZE
Max number of extraction files per directory.

--n_proc N_PROC
Number of processes to use for extraction. Parallelization is at the file level,
so using more than one process for a single file has no effect.

Record arguments
----------------

--max_retry MAX_RETRY
Max number of WARC download attempts.

--sleep_step SLEEP_STEP
Number of additional seconds to add to the sleep time between each failed download attempt.

Html arguments
--------------

--date DATE
Date of extraction of HTML files in ISO format (e.g., 2021-01-01). The default is today.

--url URL
URL from which the HTML files were downloaded. By default, it will try to infer from the file content.


Examples
--------

.. code-block:: bash

   # Take the domain records downloaded with the first command and extract them using your extractors
   cmon extract config.json extracted_output dr_output/*/*.jsonl record --max_retry 100 --sleep_step 10
   # Take the HTML files downloaded with the second command and extract them using your extractors
   cmon extract config.json extracted_output html_output/*/*.html html --date 2021-01-01 --url https://www.example.com

When you build your extractors, you will appreciate being able to specify the URL and
extraction date of an HTML file, because this information is used during extractor routing.
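
Because parallelization happens at the file level, extraction over many input files can be
sped up with ``--n_proc``; a sketch (the process count is illustrative):

.. code-block:: bash

   # Illustrative: extract many .jsonl files in parallel, roughly one file per worker process
   cmon extract config.json extracted_output dr_output/*/*.jsonl record --n_proc 8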
12 changes: 12 additions & 0 deletions docs/build/html/_sources/cli/index.rst.txt
@@ -0,0 +1,12 @@
Command Line Interface
======================

.. toctree::
   :maxdepth: 2
   :caption: Contents:

   cli
   download
   extract


132 changes: 132 additions & 0 deletions docs/build/html/_sources/extraction/config_file.rst.txt
@@ -0,0 +1,132 @@
.. _extractor_config:

Extractor config file
==========================

Structure
---------

In order to specify which extractor to use, you need to create a config file.
Its structure is as follows:

.. code-block:: json

   {
       "extractors_path": "Path to the extractors folder",
       "routes": [
           {
               "regexes": [".*"],
               "extractors": [
                   {
                       "name": "my_extractor",
                       "since": "iso date string",
                       "to": "iso date string"
                   },
                   {
                       "name": "my_extractor2"
                   }
               ]
           },
           {
               "regexes": ["another_regex"],
               "...": "..."
           }
       ]
   }

The ``extractors_path`` is the path to the folder where the extractors are located.

.. note::
   The ``extractors_path`` is relative to the current working directory.


The ``routes`` key is a list of routes. Each route is a dictionary with the following keys:

* ``regexes``: a list of regexes. At least one regex must match the URL for this route to be used.
* ``extractors``: a list of extractors that will be used to extract the data from the URL.


Each extractor has the following keys:

* ``name``: the name of the extractor. This is the name of the Python file without the ``.py`` extension; you can also set a ``NAME`` variable in the extractor file to override this.
* ``since`` [optional]: the starting crawl date for which the extractor is valid. It must be a full ISO date string (e.g. 2009-01-01T00:00:00+00:00).
* ``to`` [optional]: the ending crawl date for which the extractor is valid. The format is the same as for ``since``.

.. note::
   If ``since`` and ``to`` are not specified, the extractor will be used for all crawls.


Example
-------

Given the following folder structure:

.. code-block:: text

   extractors/
   ├── a_extractor.py
   ├── a_extractor2.py
   └── b_extractor.py

and the following config:

.. code-block:: json

   {
       "extractors_path": "./extractors",
       "routes": [
           {
               "regexes": [".*cmon.cz.*"],
               "extractors": [
                   {
                       "name": "a_extractor",
                       "to": "2010-01-01T00:00:00+00:00"
                   },
                   {
                       "name": "a_extractor2",
                       "since": "2010-01-01T00:00:00+00:00"
                   }
               ]
           },
           {
               "regexes": [".*cmon2.cz.*"],
               "extractors": [
                   {
                       "name": "b_extractor"
                   }
               ]
           }
       ]
   }

The following will happen:

* A domain record with the URL http://www.cmon.cz, crawled in 2012, will be extracted using the a_extractor2.py extractor.
* A domain record with the URL http://www.cmon.cz, crawled in 2009, will be extracted using the a_extractor.py extractor.
* A domain record with the URL http://www.cmon2.cz, crawled in 2012, will be extracted using the b_extractor.py extractor.
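
Given this config, a run over previously downloaded domain records could look like the
following sketch (the paths are illustrative):

.. code-block:: bash

   # Assumes config.json is the file above and dr_output/ came from `cmon download ... record`
   cmon extract config.json extracted_output dr_output/*/*.jsonl record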


``__init__.py``
---------------
You might want to put code shared by your extractors into
a common Python file. The problem is that during execution,
the extractors directory is not on the Python path. To make it available,
the ``__init__.py`` file is also loaded (but don't load extractors from it).

Thus you can create an ``__init__.py`` file in the extractors directory with the following content:

.. code-block:: python

   import sys
   from pathlib import Path

   # sys.path entries should be strings, so convert the Path object
   sys.path.append(str(Path(__file__).parent))

which will add the extractors directory to the Python path.


Arbitrary Code Execution
------------------------
.. warning::
   Since the router loads and executes all files in the extractors
   directory, every ``.py`` file in this directory is executed. Thus
   you should not put any untrusted files in this directory.