li{position:relative}.fa-li{left:-2em;position:absolute;text-align:center;width:2em;line-height:inherit}.fa-border{border:.08em solid #eee;border-radius:.1em;padding:.2em .25em .15em}.fa-pull-left{float:left}.fa-pull-right{float:right}.fa.fa-pull-left,.fab.fa-pull-left,.fal.fa-pull-left,.far.fa-pull-left,.fas.fa-pull-left{margin-right:.3em}.fa.fa-pull-right,.fab.fa-pull-right,.fal.fa-pull-right,.far.fa-pull-right,.fas.fa-pull-right{margin-left:.3em}.fa-spin{-webkit-animation:fa-spin 2s linear infinite;animation:fa-spin 2s linear infinite}.fa-pulse{-webkit-animation:fa-spin 1s steps(8) infinite;animation:fa-spin 1s steps(8) infinite}@-webkit-keyframes fa-spin{0%{-webkit-transform:rotate(0deg);transform:rotate(0deg)}to{-webkit-transform:rotate(1turn);transform:rotate(1turn)}}@keyframes fa-spin{0%{-webkit-transform:rotate(0deg);transform:rotate(0deg)}to{-webkit-transform:rotate(1turn);transform:rotate(1turn)}}.fa-rotate-90{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=1)";-webkit-transform:rotate(90deg);transform:rotate(90deg)}.fa-rotate-180{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=2)";-webkit-transform:rotate(180deg);transform:rotate(180deg)}.fa-rotate-270{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=3)";-webkit-transform:rotate(270deg);transform:rotate(270deg)}.fa-flip-horizontal{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=0, mirror=1)";-webkit-transform:scaleX(-1);transform:scaleX(-1)}.fa-flip-vertical{-webkit-transform:scaleY(-1);transform:scaleY(-1)}.fa-flip-both,.fa-flip-horizontal.fa-flip-vertical,.fa-flip-vertical{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=2, mirror=1)"}.fa-flip-both,.fa-flip-horizontal.fa-flip-vertical{-webkit-transform:scale(-1);transform:scale(-1)}:root .fa-flip-both,:root .fa-flip-horizontal,:root .fa-flip-vertical,:root .fa-rotate-90,:root .fa-rotate-180,:root 
.fa-rotate-270{-webkit-filter:none;filter:none}.fa-stack{display:inline-block;height:2em;line-height:2em;position:relative;vertical-align:middle;width:2.5em}.fa-stack-1x,.fa-stack-2x{left:0;position:absolute;text-align:center;width:100%}.fa-stack-1x{line-height:inherit}.fa-stack-2x{font-size:2em}.fa-inverse{color:#fff}.fa-500px:before{content:"\f26e"}.fa-accessible-icon:before{content:"\f368"}.fa-accusoft:before{content:"\f369"}.fa-acquisitions-incorporated:before{content:"\f6af"}.fa-ad:before{content:"\f641"}.fa-address-book:before{content:"\f2b9"}.fa-address-card:before{content:"\f2bb"}.fa-adjust:before{content:"\f042"}.fa-adn:before{content:"\f170"}.fa-adobe:before{content:"\f778"}.fa-adversal:before{content:"\f36a"}.fa-affiliatetheme:before{content:"\f36b"}.fa-air-freshener:before{content:"\f5d0"}.fa-airbnb:before{content:"\f834"}.fa-algolia:before{content:"\f36c"}.fa-align-center:before{content:"\f037"}.fa-align-justify:before{content:"\f039"}.fa-align-left:before{content:"\f036"}.fa-align-right:before{content:"\f038"}.fa-alipay:before{content:"\f642"}.fa-allergies:before{content:"\f461"}.fa-amazon:before{content:"\f270"}.fa-amazon-pay:before{content:"\f42c"}.fa-ambulance:before{content:"\f0f9"}.fa-american-sign-language-interpreting:before{content:"\f2a3"}.fa-amilia:before{content:"\f36d"}.fa-anchor:before{content:"\f13d"}.fa-android:before{content:"\f17b"}.fa-angellist:before{content:"\f209"}.fa-angle-double-down:before{content:"\f103"}.fa-angle-double-left:before{content:"\f100"}.fa-angle-double-right:before{content:"\f101"}.fa-angle-double-up:before{content:"\f102"}.fa-angle-down:before{content:"\f107"}.fa-angle-left:before{content:"\f104"}.fa-angle-right:before{content:"\f105"}.fa-angle-up:before{content:"\f106"}.fa-angry:before{content:"\f556"}.fa-angrycreative:before{content:"\f36e"}.fa-angular:before{content:"\f420"}.fa-ankh:before{content:"\f644"}.fa-app-store:before{content:"\f36f"}.fa-app-store-ios:before{content:"\f370"}.fa-apper:before{content:"\f3
71"}.fa-apple:before{content:"\f179"}.fa-apple-alt:before{content:"\f5d1"}.fa-apple-pay:before{content:"\f415"}.fa-archive:before{content:"\f187"}.fa-archway:before{content:"\f557"}.fa-arrow-alt-circle-down:before{content:"\f358"}.fa-arrow-alt-circle-left:before{content:"\f359"}.fa-arrow-alt-circle-right:before{content:"\f35a"}.fa-arrow-alt-circle-up:before{content:"\f35b"}.fa-arrow-circle-down:before{content:"\f0ab"}.fa-arrow-circle-left:before{content:"\f0a8"}.fa-arrow-circle-right:before{content:"\f0a9"}.fa-arrow-circle-up:before{content:"\f0aa"}.fa-arrow-down:before{content:"\f063"}.fa-arrow-left:before{content:"\f060"}.fa-arrow-right:before{content:"\f061"}.fa-arrow-up:before{content:"\f062"}.fa-arrows-alt:before{content:"\f0b2"}.fa-arrows-alt-h:before{content:"\f337"}.fa-arrows-alt-v:before{content:"\f338"}.fa-artstation:before{content:"\f77a"}.fa-assistive-listening-systems:before{content:"\f2a2"}.fa-asterisk:before{content:"\f069"}.fa-asymmetrik:before{content:"\f372"}.fa-at:before{content:"\f1fa"}.fa-atlas:before{content:"\f558"}.fa-atlassian:before{content:"\f77b"}.fa-atom:before{content:"\f5d2"}.fa-audible:before{content:"\f373"}.fa-audio-description:before{content:"\f29e"}.fa-autoprefixer:before{content:"\f41c"}.fa-avianex:before{content:"\f374"}.fa-aviato:before{content:"\f421"}.fa-award:before{content:"\f559"}.fa-aws:before{content:"\f375"}.fa-baby:before{content:"\f77c"}.fa-baby-carriage:before{content:"\f77d"}.fa-backspace:before{content:"\f55a"}.fa-backward:before{content:"\f04a"}.fa-bacon:before{content:"\f7e5"}.fa-bahai:before{content:"\f666"}.fa-balance-scale:before{content:"\f24e"}.fa-balance-scale-left:before{content:"\f515"}.fa-balance-scale-right:before{content:"\f516"}.fa-ban:before{content:"\f05e"}.fa-band-aid:before{content:"\f462"}.fa-bandcamp:before{content:"\f2d5"}.fa-barcode:before{content:"\f02a"}.fa-bars:before{content:"\f0c9"}.fa-baseball-ball:before{content:"\f433"}.fa-basketball-ball:before{content:"\f434"}.fa-bath:before{content:
"\f2cd"}.fa-battery-empty:before{content:"\f244"}.fa-battery-full:before{content:"\f240"}.fa-battery-half:before{content:"\f242"}.fa-battery-quarter:before{content:"\f243"}.fa-battery-three-quarters:before{content:"\f241"}.fa-battle-net:before{content:"\f835"}.fa-bed:before{content:"\f236"}.fa-beer:before{content:"\f0fc"}.fa-behance:before{content:"\f1b4"}.fa-behance-square:before{content:"\f1b5"}.fa-bell:before{content:"\f0f3"}.fa-bell-slash:before{content:"\f1f6"}.fa-bezier-curve:before{content:"\f55b"}.fa-bible:before{content:"\f647"}.fa-bicycle:before{content:"\f206"}.fa-biking:before{content:"\f84a"}.fa-bimobject:before{content:"\f378"}.fa-binoculars:before{content:"\f1e5"}.fa-biohazard:before{content:"\f780"}.fa-birthday-cake:before{content:"\f1fd"}.fa-bitbucket:before{content:"\f171"}.fa-bitcoin:before{content:"\f379"}.fa-bity:before{content:"\f37a"}.fa-black-tie:before{content:"\f27e"}.fa-blackberry:before{content:"\f37b"}.fa-blender:before{content:"\f517"}.fa-blender-phone:before{content:"\f6b6"}.fa-blind:before{content:"\f29d"}.fa-blog:before{content:"\f781"}.fa-blogger:before{content:"\f37c"}.fa-blogger-b:before{content:"\f37d"}.fa-bluetooth:before{content:"\f293"}.fa-bluetooth-b:before{content:"\f294"}.fa-bold:before{content:"\f032"}.fa-bolt:before{content:"\f0e7"}.fa-bomb:before{content:"\f1e2"}.fa-bone:before{content:"\f5d7"}.fa-bong:before{content:"\f55c"}.fa-book:before{content:"\f02d"}.fa-book-dead:before{content:"\f6b7"}.fa-book-medical:before{content:"\f7e6"}.fa-book-open:before{content:"\f518"}.fa-book-reader:before{content:"\f5da"}.fa-bookmark:before{content:"\f02e"}.fa-bootstrap:before{content:"\f836"}.fa-border-all:before{content:"\f84c"}.fa-border-none:before{content:"\f850"}.fa-border-style:before{content:"\f853"}.fa-bowling-ball:before{content:"\f436"}.fa-box:before{content:"\f466"}.fa-box-open:before{content:"\f49e"}.fa-box-tissue:before{content:"\f95b"}.fa-boxes:before{content:"\f468"}.fa-braille:before{content:"\f2a1"}.fa-brain:before{co
ntent:"\f5dc"}.fa-bread-slice:before{content:"\f7ec"}.fa-briefcase:before{content:"\f0b1"}.fa-briefcase-medical:before{content:"\f469"}.fa-broadcast-tower:before{content:"\f519"}.fa-broom:before{content:"\f51a"}.fa-brush:before{content:"\f55d"}.fa-btc:before{content:"\f15a"}.fa-buffer:before{content:"\f837"}.fa-bug:before{content:"\f188"}.fa-building:before{content:"\f1ad"}.fa-bullhorn:before{content:"\f0a1"}.fa-bullseye:before{content:"\f140"}.fa-burn:before{content:"\f46a"}.fa-buromobelexperte:before{content:"\f37f"}.fa-bus:before{content:"\f207"}.fa-bus-alt:before{content:"\f55e"}.fa-business-time:before{content:"\f64a"}.fa-buy-n-large:before{content:"\f8a6"}.fa-buysellads:before{content:"\f20d"}.fa-calculator:before{content:"\f1ec"}.fa-calendar:before{content:"\f133"}.fa-calendar-alt:before{content:"\f073"}.fa-calendar-check:before{content:"\f274"}.fa-calendar-day:before{content:"\f783"}.fa-calendar-minus:before{content:"\f272"}.fa-calendar-plus:before{content:"\f271"}.fa-calendar-times:before{content:"\f273"}.fa-calendar-week:before{content:"\f784"}.fa-camera:before{content:"\f030"}.fa-camera-retro:before{content:"\f083"}.fa-campground:before{content:"\f6bb"}.fa-canadian-maple-leaf:before{content:"\f785"}.fa-candy-cane:before{content:"\f786"}.fa-cannabis:before{content:"\f55f"}.fa-capsules:before{content:"\f46b"}.fa-car:before{content:"\f1b9"}.fa-car-alt:before{content:"\f5de"}.fa-car-battery:before{content:"\f5df"}.fa-car-crash:before{content:"\f5e1"}.fa-car-side:before{content:"\f5e4"}.fa-caravan:before{content:"\f8ff"}.fa-caret-down:before{content:"\f0d7"}.fa-caret-left:before{content:"\f0d9"}.fa-caret-right:before{content:"\f0da"}.fa-caret-square-down:before{content:"\f150"}.fa-caret-square-left:before{content:"\f191"}.fa-caret-square-right:before{content:"\f152"}.fa-caret-square-up:before{content:"\f151"}.fa-caret-up:before{content:"\f0d8"}.fa-carrot:before{content:"\f787"}.fa-cart-arrow-down:before{content:"\f218"}.fa-cart-plus:before{content:"\f217"}.fa-
cash-register:before{content:"\f788"}.fa-cat:before{content:"\f6be"}.fa-cc-amazon-pay:before{content:"\f42d"}.fa-cc-amex:before{content:"\f1f3"}.fa-cc-apple-pay:before{content:"\f416"}.fa-cc-diners-club:before{content:"\f24c"}.fa-cc-discover:before{content:"\f1f2"}.fa-cc-jcb:before{content:"\f24b"}.fa-cc-mastercard:before{content:"\f1f1"}.fa-cc-paypal:before{content:"\f1f4"}.fa-cc-stripe:before{content:"\f1f5"}.fa-cc-visa:before{content:"\f1f0"}.fa-centercode:before{content:"\f380"}.fa-centos:before{content:"\f789"}.fa-certificate:before{content:"\f0a3"}.fa-chair:before{content:"\f6c0"}.fa-chalkboard:before{content:"\f51b"}.fa-chalkboard-teacher:before{content:"\f51c"}.fa-charging-station:before{content:"\f5e7"}.fa-chart-area:before{content:"\f1fe"}.fa-chart-bar:before{content:"\f080"}.fa-chart-line:before{content:"\f201"}.fa-chart-pie:before{content:"\f200"}.fa-check:before{content:"\f00c"}.fa-check-circle:before{content:"\f058"}.fa-check-double:before{content:"\f560"}.fa-check-square:before{content:"\f14a"}.fa-cheese:before{content:"\f7ef"}.fa-chess:before{content:"\f439"}.fa-chess-bishop:before{content:"\f43a"}.fa-chess-board:before{content:"\f43c"}.fa-chess-king:before{content:"\f43f"}.fa-chess-knight:before{content:"\f441"}.fa-chess-pawn:before{content:"\f443"}.fa-chess-queen:before{content:"\f445"}.fa-chess-rook:before{content:"\f447"}.fa-chevron-circle-down:before{content:"\f13a"}.fa-chevron-circle-left:before{content:"\f137"}.fa-chevron-circle-right:before{content:"\f138"}.fa-chevron-circle-up:before{content:"\f139"}.fa-chevron-down:before{content:"\f078"}.fa-chevron-left:before{content:"\f053"}.fa-chevron-right:before{content:"\f054"}.fa-chevron-up:before{content:"\f077"}.fa-child:before{content:"\f1ae"}.fa-chrome:before{content:"\f268"}.fa-chromecast:before{content:"\f838"}.fa-church:before{content:"\f51d"}.fa-circle:before{content:"\f111"}.fa-circle-notch:before{content:"\f1ce"}.fa-city:before{content:"\f64f"}.fa-clinic-medical:before{content:"\f7f2"}.fa-
clipboard:before{content:"\f328"}.fa-clipboard-check:before{content:"\f46c"}.fa-clipboard-list:before{content:"\f46d"}.fa-clock:before{content:"\f017"}.fa-clone:before{content:"\f24d"}.fa-closed-captioning:before{content:"\f20a"}.fa-cloud:before{content:"\f0c2"}.fa-cloud-download-alt:before{content:"\f381"}.fa-cloud-meatball:before{content:"\f73b"}.fa-cloud-moon:before{content:"\f6c3"}.fa-cloud-moon-rain:before{content:"\f73c"}.fa-cloud-rain:before{content:"\f73d"}.fa-cloud-showers-heavy:before{content:"\f740"}.fa-cloud-sun:before{content:"\f6c4"}.fa-cloud-sun-rain:before{content:"\f743"}.fa-cloud-upload-alt:before{content:"\f382"}.fa-cloudscale:before{content:"\f383"}.fa-cloudsmith:before{content:"\f384"}.fa-cloudversify:before{content:"\f385"}.fa-cocktail:before{content:"\f561"}.fa-code:before{content:"\f121"}.fa-code-branch:before{content:"\f126"}.fa-codepen:before{content:"\f1cb"}.fa-codiepie:before{content:"\f284"}.fa-coffee:before{content:"\f0f4"}.fa-cog:before{content:"\f013"}.fa-cogs:before{content:"\f085"}.fa-coins:before{content:"\f51e"}.fa-columns:before{content:"\f0db"}.fa-comment:before{content:"\f075"}.fa-comment-alt:before{content:"\f27a"}.fa-comment-dollar:before{content:"\f651"}.fa-comment-dots:before{content:"\f4ad"}.fa-comment-medical:before{content:"\f7f5"}.fa-comment-slash:before{content:"\f4b3"}.fa-comments:before{content:"\f086"}.fa-comments-dollar:before{content:"\f653"}.fa-compact-disc:before{content:"\f51f"}.fa-compass:before{content:"\f14e"}.fa-compress:before{content:"\f066"}.fa-compress-alt:before{content:"\f422"}.fa-compress-arrows-alt:before{content:"\f78c"}.fa-concierge-bell:before{content:"\f562"}.fa-confluence:before{content:"\f78d"}.fa-connectdevelop:before{content:"\f20e"}.fa-contao:before{content:"\f26d"}.fa-cookie:before{content:"\f563"}.fa-cookie-bite:before{content:"\f564"}.fa-copy:before{content:"\f0c5"}.fa-copyright:before{content:"\f1f9"}.fa-cotton-bureau:before{content:"\f89e"}.fa-couch:before{content:"\f4b8"}.fa-cpanel:be
fore{content:"\f388"}.fa-creative-commons:before{content:"\f25e"}.fa-creative-commons-by:before{content:"\f4e7"}.fa-creative-commons-nc:before{content:"\f4e8"}.fa-creative-commons-nc-eu:before{content:"\f4e9"}.fa-creative-commons-nc-jp:before{content:"\f4ea"}.fa-creative-commons-nd:before{content:"\f4eb"}.fa-creative-commons-pd:before{content:"\f4ec"}.fa-creative-commons-pd-alt:before{content:"\f4ed"}.fa-creative-commons-remix:before{content:"\f4ee"}.fa-creative-commons-sa:before{content:"\f4ef"}.fa-creative-commons-sampling:before{content:"\f4f0"}.fa-creative-commons-sampling-plus:before{content:"\f4f1"}.fa-creative-commons-share:before{content:"\f4f2"}.fa-creative-commons-zero:before{content:"\f4f3"}.fa-credit-card:before{content:"\f09d"}.fa-critical-role:before{content:"\f6c9"}.fa-crop:before{content:"\f125"}.fa-crop-alt:before{content:"\f565"}.fa-cross:before{content:"\f654"}.fa-crosshairs:before{content:"\f05b"}.fa-crow:before{content:"\f520"}.fa-crown:before{content:"\f521"}.fa-crutch:before{content:"\f7f7"}.fa-css3:before{content:"\f13c"}.fa-css3-alt:before{content:"\f38b"}.fa-cube:before{content:"\f1b2"}.fa-cubes:before{content:"\f1b3"}.fa-cut:before{content:"\f0c4"}.fa-cuttlefish:before{content:"\f38c"}.fa-d-and-d:before{content:"\f38d"}.fa-d-and-d-beyond:before{content:"\f6ca"}.fa-dailymotion:before{content:"\f952"}.fa-dashcube:before{content:"\f210"}.fa-database:before{content:"\f1c0"}.fa-deaf:before{content:"\f2a4"}.fa-delicious:before{content:"\f1a5"}.fa-democrat:before{content:"\f747"}.fa-deploydog:before{content:"\f38e"}.fa-deskpro:before{content:"\f38f"}.fa-desktop:before{content:"\f108"}.fa-dev:before{content:"\f6cc"}.fa-deviantart:before{content:"\f1bd"}.fa-dharmachakra:before{content:"\f655"}.fa-dhl:before{content:"\f790"}.fa-diagnoses:before{content:"\f470"}.fa-diaspora:before{content:"\f791"}.fa-dice:before{content:"\f522"}.fa-dice-d20:before{content:"\f6cf"}.fa-dice-d6:before{content:"\f6d1"}.fa-dice-five:before{content:"\f523"}.fa-dice-four:be
fore{content:"\f524"}.fa-dice-one:before{content:"\f525"}.fa-dice-six:before{content:"\f526"}.fa-dice-three:before{content:"\f527"}.fa-dice-two:before{content:"\f528"}.fa-digg:before{content:"\f1a6"}.fa-digital-ocean:before{content:"\f391"}.fa-digital-tachograph:before{content:"\f566"}.fa-directions:before{content:"\f5eb"}.fa-discord:before{content:"\f392"}.fa-discourse:before{content:"\f393"}.fa-disease:before{content:"\f7fa"}.fa-divide:before{content:"\f529"}.fa-dizzy:before{content:"\f567"}.fa-dna:before{content:"\f471"}.fa-dochub:before{content:"\f394"}.fa-docker:before{content:"\f395"}.fa-dog:before{content:"\f6d3"}.fa-dollar-sign:before{content:"\f155"}.fa-dolly:before{content:"\f472"}.fa-dolly-flatbed:before{content:"\f474"}.fa-donate:before{content:"\f4b9"}.fa-door-closed:before{content:"\f52a"}.fa-door-open:before{content:"\f52b"}.fa-dot-circle:before{content:"\f192"}.fa-dove:before{content:"\f4ba"}.fa-download:before{content:"\f019"}.fa-draft2digital:before{content:"\f396"}.fa-drafting-compass:before{content:"\f568"}.fa-dragon:before{content:"\f6d5"}.fa-draw-polygon:before{content:"\f5ee"}.fa-dribbble:before{content:"\f17d"}.fa-dribbble-square:before{content:"\f397"}.fa-dropbox:before{content:"\f16b"}.fa-drum:before{content:"\f569"}.fa-drum-steelpan:before{content:"\f56a"}.fa-drumstick-bite:before{content:"\f6d7"}.fa-drupal:before{content:"\f1a9"}.fa-dumbbell:before{content:"\f44b"}.fa-dumpster:before{content:"\f793"}.fa-dumpster-fire:before{content:"\f794"}.fa-dungeon:before{content:"\f6d9"}.fa-dyalog:before{content:"\f399"}.fa-earlybirds:before{content:"\f39a"}.fa-ebay:before{content:"\f4f4"}.fa-edge:before{content:"\f282"}.fa-edit:before{content:"\f044"}.fa-egg:before{content:"\f7fb"}.fa-eject:before{content:"\f052"}.fa-elementor:before{content:"\f430"}.fa-ellipsis-h:before{content:"\f141"}.fa-ellipsis-v:before{content:"\f142"}.fa-ello:before{content:"\f5f1"}.fa-ember:before{content:"\f423"}.fa-empire:before{content:"\f1d1"}.fa-envelope:before{content:"
\f0e0"}.fa-envelope-open:before{content:"\f2b6"}.fa-envelope-open-text:before{content:"\f658"}.fa-envelope-square:before{content:"\f199"}.fa-envira:before{content:"\f299"}.fa-equals:before{content:"\f52c"}.fa-eraser:before{content:"\f12d"}.fa-erlang:before{content:"\f39d"}.fa-ethereum:before{content:"\f42e"}.fa-ethernet:before{content:"\f796"}.fa-etsy:before{content:"\f2d7"}.fa-euro-sign:before{content:"\f153"}.fa-evernote:before{content:"\f839"}.fa-exchange-alt:before{content:"\f362"}.fa-exclamation:before{content:"\f12a"}.fa-exclamation-circle:before{content:"\f06a"}.fa-exclamation-triangle:before{content:"\f071"}.fa-expand:before{content:"\f065"}.fa-expand-alt:before{content:"\f424"}.fa-expand-arrows-alt:before{content:"\f31e"}.fa-expeditedssl:before{content:"\f23e"}.fa-external-link-alt:before{content:"\f35d"}.fa-external-link-square-alt:before{content:"\f360"}.fa-eye:before{content:"\f06e"}.fa-eye-dropper:before{content:"\f1fb"}.fa-eye-slash:before{content:"\f070"}.fa-facebook:before{content:"\f09a"}.fa-facebook-f:before{content:"\f39e"}.fa-facebook-messenger:before{content:"\f39f"}.fa-facebook-square:before{content:"\f082"}.fa-fan:before{content:"\f863"}.fa-fantasy-flight-games:before{content:"\f6dc"}.fa-fast-backward:before{content:"\f049"}.fa-fast-forward:before{content:"\f050"}.fa-faucet:before{content:"\f905"}.fa-fax:before{content:"\f1ac"}.fa-feather:before{content:"\f52d"}.fa-feather-alt:before{content:"\f56b"}.fa-fedex:before{content:"\f797"}.fa-fedora:before{content:"\f798"}.fa-female:before{content:"\f182"}.fa-fighter-jet:before{content:"\f0fb"}.fa-figma:before{content:"\f799"}.fa-file:before{content:"\f15b"}.fa-file-alt:before{content:"\f15c"}.fa-file-archive:before{content:"\f1c6"}.fa-file-audio:before{content:"\f1c7"}.fa-file-code:before{content:"\f1c9"}.fa-file-contract:before{content:"\f56c"}.fa-file-csv:before{content:"\f6dd"}.fa-file-download:before{content:"\f56d"}.fa-file-excel:before{content:"\f1c3"}.fa-file-export:before{content:"\f56e"}.fa
-file-image:before{content:"\f1c5"}.fa-file-import:before{content:"\f56f"}.fa-file-invoice:before{content:"\f570"}.fa-file-invoice-dollar:before{content:"\f571"}.fa-file-medical:before{content:"\f477"}.fa-file-medical-alt:before{content:"\f478"}.fa-file-pdf:before{content:"\f1c1"}.fa-file-powerpoint:before{content:"\f1c4"}.fa-file-prescription:before{content:"\f572"}.fa-file-signature:before{content:"\f573"}.fa-file-upload:before{content:"\f574"}.fa-file-video:before{content:"\f1c8"}.fa-file-word:before{content:"\f1c2"}.fa-fill:before{content:"\f575"}.fa-fill-drip:before{content:"\f576"}.fa-film:before{content:"\f008"}.fa-filter:before{content:"\f0b0"}.fa-fingerprint:before{content:"\f577"}.fa-fire:before{content:"\f06d"}.fa-fire-alt:before{content:"\f7e4"}.fa-fire-extinguisher:before{content:"\f134"}.fa-firefox:before{content:"\f269"}.fa-firefox-browser:before{content:"\f907"}.fa-first-aid:before{content:"\f479"}.fa-first-order:before{content:"\f2b0"}.fa-first-order-alt:before{content:"\f50a"}.fa-firstdraft:before{content:"\f3a1"}.fa-fish:before{content:"\f578"}.fa-fist-raised:before{content:"\f6de"}.fa-flag:before{content:"\f024"}.fa-flag-checkered:before{content:"\f11e"}.fa-flag-usa:before{content:"\f74d"}.fa-flask:before{content:"\f0c3"}.fa-flickr:before{content:"\f16e"}.fa-flipboard:before{content:"\f44d"}.fa-flushed:before{content:"\f579"}.fa-fly:before{content:"\f417"}.fa-folder:before{content:"\f07b"}.fa-folder-minus:before{content:"\f65d"}.fa-folder-open:before{content:"\f07c"}.fa-folder-plus:before{content:"\f65e"}.fa-font:before{content:"\f031"}.fa-font-awesome:before{content:"\f2b4"}.fa-font-awesome-alt:before{content:"\f35c"}.fa-font-awesome-flag:before{content:"\f425"}.fa-font-awesome-logo-full:before{content:"\f4e6"}.fa-fonticons:before{content:"\f280"}.fa-fonticons-fi:before{content:"\f3a2"}.fa-football-ball:before{content:"\f44e"}.fa-fort-awesome:before{content:"\f286"}.fa-fort-awesome-alt:before{content:"\f3a3"}.fa-forumbee:before{content:"\f211"}.
fa-forward:before{content:"\f04e"}.fa-foursquare:before{content:"\f180"}.fa-free-code-camp:before{content:"\f2c5"}.fa-freebsd:before{content:"\f3a4"}.fa-frog:before{content:"\f52e"}.fa-frown:before{content:"\f119"}.fa-frown-open:before{content:"\f57a"}.fa-fulcrum:before{content:"\f50b"}.fa-funnel-dollar:before{content:"\f662"}.fa-futbol:before{content:"\f1e3"}.fa-galactic-republic:before{content:"\f50c"}.fa-galactic-senate:before{content:"\f50d"}.fa-gamepad:before{content:"\f11b"}.fa-gas-pump:before{content:"\f52f"}.fa-gavel:before{content:"\f0e3"}.fa-gem:before{content:"\f3a5"}.fa-genderless:before{content:"\f22d"}.fa-get-pocket:before{content:"\f265"}.fa-gg:before{content:"\f260"}.fa-gg-circle:before{content:"\f261"}.fa-ghost:before{content:"\f6e2"}.fa-gift:before{content:"\f06b"}.fa-gifts:before{content:"\f79c"}.fa-git:before{content:"\f1d3"}.fa-git-alt:before{content:"\f841"}.fa-git-square:before{content:"\f1d2"}.fa-github:before{content:"\f09b"}.fa-github-alt:before{content:"\f113"}.fa-github-square:before{content:"\f092"}.fa-gitkraken:before{content:"\f3a6"}.fa-gitlab:before{content:"\f296"}.fa-gitter:before{content:"\f426"}.fa-glass-cheers:before{content:"\f79f"}.fa-glass-martini:before{content:"\f000"}.fa-glass-martini-alt:before{content:"\f57b"}.fa-glass-whiskey:before{content:"\f7a0"}.fa-glasses:before{content:"\f530"}.fa-glide:before{content:"\f2a5"}.fa-glide-g:before{content:"\f2a6"}.fa-globe:before{content:"\f0ac"}.fa-globe-africa:before{content:"\f57c"}.fa-globe-americas:before{content:"\f57d"}.fa-globe-asia:before{content:"\f57e"}.fa-globe-europe:before{content:"\f7a2"}.fa-gofore:before{content:"\f3a7"}.fa-golf-ball:before{content:"\f450"}.fa-goodreads:before{content:"\f3a8"}.fa-goodreads-g:before{content:"\f3a9"}.fa-google:before{content:"\f1a0"}.fa-google-drive:before{content:"\f3aa"}.fa-google-play:before{content:"\f3ab"}.fa-google-plus:before{content:"\f2b3"}.fa-google-plus-g:before{content:"\f0d5"}.fa-google-plus-square:before{content:"\f0d4"}.fa
-google-wallet:before{content:"\f1ee"}.fa-gopuram:before{content:"\f664"}.fa-graduation-cap:before{content:"\f19d"}.fa-gratipay:before{content:"\f184"}.fa-grav:before{content:"\f2d6"}.fa-greater-than:before{content:"\f531"}.fa-greater-than-equal:before{content:"\f532"}.fa-grimace:before{content:"\f57f"}.fa-grin:before{content:"\f580"}.fa-grin-alt:before{content:"\f581"}.fa-grin-beam:before{content:"\f582"}.fa-grin-beam-sweat:before{content:"\f583"}.fa-grin-hearts:before{content:"\f584"}.fa-grin-squint:before{content:"\f585"}.fa-grin-squint-tears:before{content:"\f586"}.fa-grin-stars:before{content:"\f587"}.fa-grin-tears:before{content:"\f588"}.fa-grin-tongue:before{content:"\f589"}.fa-grin-tongue-squint:before{content:"\f58a"}.fa-grin-tongue-wink:before{content:"\f58b"}.fa-grin-wink:before{content:"\f58c"}.fa-grip-horizontal:before{content:"\f58d"}.fa-grip-lines:before{content:"\f7a4"}.fa-grip-lines-vertical:before{content:"\f7a5"}.fa-grip-vertical:before{content:"\f58e"}.fa-gripfire:before{content:"\f3ac"}.fa-grunt:before{content:"\f3ad"}.fa-guitar:before{content:"\f7a6"}.fa-gulp:before{content:"\f3ae"}.fa-h-square:before{content:"\f0fd"}.fa-hacker-news:before{content:"\f1d4"}.fa-hacker-news-square:before{content:"\f3af"}.fa-hackerrank:before{content:"\f5f7"}.fa-hamburger:before{content:"\f805"}.fa-hammer:before{content:"\f6e3"}.fa-hamsa:before{content:"\f665"}.fa-hand-holding:before{content:"\f4bd"}.fa-hand-holding-heart:before{content:"\f4be"}.fa-hand-holding-medical:before{content:"\f95c"}.fa-hand-holding-usd:before{content:"\f4c0"}.fa-hand-holding-water:before{content:"\f4c1"}.fa-hand-lizard:before{content:"\f258"}.fa-hand-middle-finger:before{content:"\f806"}.fa-hand-paper:before{content:"\f256"}.fa-hand-peace:before{content:"\f25b"}.fa-hand-point-down:before{content:"\f0a7"}.fa-hand-point-left:before{content:"\f0a5"}.fa-hand-point-right:before{content:"\f0a4"}.fa-hand-point-up:before{content:"\f0a6"}.fa-hand-pointer:before{content:"\f25a"}.fa-hand-rock:before
{content:"\f255"}.fa-hand-scissors:before{content:"\f257"}.fa-hand-sparkles:before{content:"\f95d"}.fa-hand-spock:before{content:"\f259"}.fa-hands:before{content:"\f4c2"}.fa-hands-helping:before{content:"\f4c4"}.fa-hands-wash:before{content:"\f95e"}.fa-handshake:before{content:"\f2b5"}.fa-handshake-alt-slash:before{content:"\f95f"}.fa-handshake-slash:before{content:"\f960"}.fa-hanukiah:before{content:"\f6e6"}.fa-hard-hat:before{content:"\f807"}.fa-hashtag:before{content:"\f292"}.fa-hat-cowboy:before{content:"\f8c0"}.fa-hat-cowboy-side:before{content:"\f8c1"}.fa-hat-wizard:before{content:"\f6e8"}.fa-hdd:before{content:"\f0a0"}.fa-head-side-cough:before{content:"\f961"}.fa-head-side-cough-slash:before{content:"\f962"}.fa-head-side-mask:before{content:"\f963"}.fa-head-side-virus:before{content:"\f964"}.fa-heading:before{content:"\f1dc"}.fa-headphones:before{content:"\f025"}.fa-headphones-alt:before{content:"\f58f"}.fa-headset:before{content:"\f590"}.fa-heart:before{content:"\f004"}.fa-heart-broken:before{content:"\f7a9"}.fa-heartbeat:before{content:"\f21e"}.fa-helicopter:before{content:"\f533"}.fa-highlighter:before{content:"\f591"}.fa-hiking:before{content:"\f6ec"}.fa-hippo:before{content:"\f6ed"}.fa-hips:before{content:"\f452"}.fa-hire-a-helper:before{content:"\f3b0"}.fa-history:before{content:"\f1da"}.fa-hockey-puck:before{content:"\f453"}.fa-holly-berry:before{content:"\f7aa"}.fa-home:before{content:"\f015"}.fa-hooli:before{content:"\f427"}.fa-hornbill:before{content:"\f592"}.fa-horse:before{content:"\f6f0"}.fa-horse-head:before{content:"\f7ab"}.fa-hospital:before{content:"\f0f8"}.fa-hospital-alt:before{content:"\f47d"}.fa-hospital-symbol:before{content:"\f47e"}.fa-hospital-user:before{content:"\f80d"}.fa-hot-tub:before{content:"\f593"}.fa-hotdog:before{content:"\f80f"}.fa-hotel:before{content:"\f594"}.fa-hotjar:before{content:"\f3b1"}.fa-hourglass:before{content:"\f254"}.fa-hourglass-end:before{content:"\f253"}.fa-hourglass-half:before{content:"\f252"}.fa-hourglas
s-start:before{content:"\f251"}.fa-house-damage:before{content:"\f6f1"}.fa-house-user:before{content:"\f965"}.fa-houzz:before{content:"\f27c"}.fa-hryvnia:before{content:"\f6f2"}.fa-html5:before{content:"\f13b"}.fa-hubspot:before{content:"\f3b2"}.fa-i-cursor:before{content:"\f246"}.fa-ice-cream:before{content:"\f810"}.fa-icicles:before{content:"\f7ad"}.fa-icons:before{content:"\f86d"}.fa-id-badge:before{content:"\f2c1"}.fa-id-card:before{content:"\f2c2"}.fa-id-card-alt:before{content:"\f47f"}.fa-ideal:before{content:"\f913"}.fa-igloo:before{content:"\f7ae"}.fa-image:before{content:"\f03e"}.fa-images:before{content:"\f302"}.fa-imdb:before{content:"\f2d8"}.fa-inbox:before{content:"\f01c"}.fa-indent:before{content:"\f03c"}.fa-industry:before{content:"\f275"}.fa-infinity:before{content:"\f534"}.fa-info:before{content:"\f129"}.fa-info-circle:before{content:"\f05a"}.fa-instagram:before{content:"\f16d"}.fa-instagram-square:before{content:"\f955"}.fa-intercom:before{content:"\f7af"}.fa-internet-explorer:before{content:"\f26b"}.fa-invision:before{content:"\f7b0"}.fa-ioxhost:before{content:"\f208"}.fa-italic:before{content:"\f033"}.fa-itch-io:before{content:"\f83a"}.fa-itunes:before{content:"\f3b4"}.fa-itunes-note:before{content:"\f3b5"}.fa-java:before{content:"\f4e4"}.fa-jedi:before{content:"\f669"}.fa-jedi-order:before{content:"\f50e"}.fa-jenkins:before{content:"\f3b6"}.fa-jira:before{content:"\f7b1"}.fa-joget:before{content:"\f3b7"}.fa-joint:before{content:"\f595"}.fa-joomla:before{content:"\f1aa"}.fa-journal-whills:before{content:"\f66a"}.fa-js:before{content:"\f3b8"}.fa-js-square:before{content:"\f3b9"}.fa-jsfiddle:before{content:"\f1cc"}.fa-kaaba:before{content:"\f66b"}.fa-kaggle:before{content:"\f5fa"}.fa-key:before{content:"\f084"}.fa-keybase:before{content:"\f4f5"}.fa-keyboard:before{content:"\f11c"}.fa-keycdn:before{content:"\f3ba"}.fa-khanda:before{content:"\f66d"}.fa-kickstarter:before{content:"\f3bb"}.fa-kickstarter-k:before{content:"\f3bc"}.fa-kiss:before{content
:"\f596"}.fa-kiss-beam:before{content:"\f597"}.fa-kiss-wink-heart:before{content:"\f598"}.fa-kiwi-bird:before{content:"\f535"}.fa-korvue:before{content:"\f42f"}.fa-landmark:before{content:"\f66f"}.fa-language:before{content:"\f1ab"}.fa-laptop:before{content:"\f109"}.fa-laptop-code:before{content:"\f5fc"}.fa-laptop-house:before{content:"\f966"}.fa-laptop-medical:before{content:"\f812"}.fa-laravel:before{content:"\f3bd"}.fa-lastfm:before{content:"\f202"}.fa-lastfm-square:before{content:"\f203"}.fa-laugh:before{content:"\f599"}.fa-laugh-beam:before{content:"\f59a"}.fa-laugh-squint:before{content:"\f59b"}.fa-laugh-wink:before{content:"\f59c"}.fa-layer-group:before{content:"\f5fd"}.fa-leaf:before{content:"\f06c"}.fa-leanpub:before{content:"\f212"}.fa-lemon:before{content:"\f094"}.fa-less:before{content:"\f41d"}.fa-less-than:before{content:"\f536"}.fa-less-than-equal:before{content:"\f537"}.fa-level-down-alt:before{content:"\f3be"}.fa-level-up-alt:before{content:"\f3bf"}.fa-life-ring:before{content:"\f1cd"}.fa-lightbulb:before{content:"\f0eb"}.fa-line:before{content:"\f3c0"}.fa-link:before{content:"\f0c1"}.fa-linkedin:before{content:"\f08c"}.fa-linkedin-in:before{content:"\f0e1"}.fa-linode:before{content:"\f2b8"}.fa-linux:before{content:"\f17c"}.fa-lira-sign:before{content:"\f195"}.fa-list:before{content:"\f03a"}.fa-list-alt:before{content:"\f022"}.fa-list-ol:before{content:"\f0cb"}.fa-list-ul:before{content:"\f0ca"}.fa-location-arrow:before{content:"\f124"}.fa-lock:before{content:"\f023"}.fa-lock-open:before{content:"\f3c1"}.fa-long-arrow-alt-down:before{content:"\f309"}.fa-long-arrow-alt-left:before{content:"\f30a"}.fa-long-arrow-alt-right:before{content:"\f30b"}.fa-long-arrow-alt-up:before{content:"\f30c"}.fa-low-vision:before{content:"\f2a8"}.fa-luggage-cart:before{content:"\f59d"}.fa-lungs:before{content:"\f604"}.fa-lungs-virus:before{content:"\f967"}.fa-lyft:before{content:"\f3c3"}.fa-magento:before{content:"\f3c4"}.fa-magic:before{content:"\f0d0"}.fa-magnet:before{
content:"\f076"}.fa-mail-bulk:before{content:"\f674"}.fa-mailchimp:before{content:"\f59e"}.fa-male:before{content:"\f183"}.fa-mandalorian:before{content:"\f50f"}.fa-map:before{content:"\f279"}.fa-map-marked:before{content:"\f59f"}.fa-map-marked-alt:before{content:"\f5a0"}.fa-map-marker:before{content:"\f041"}.fa-map-marker-alt:before{content:"\f3c5"}.fa-map-pin:before{content:"\f276"}.fa-map-signs:before{content:"\f277"}.fa-markdown:before{content:"\f60f"}.fa-marker:before{content:"\f5a1"}.fa-mars:before{content:"\f222"}.fa-mars-double:before{content:"\f227"}.fa-mars-stroke:before{content:"\f229"}.fa-mars-stroke-h:before{content:"\f22b"}.fa-mars-stroke-v:before{content:"\f22a"}.fa-mask:before{content:"\f6fa"}.fa-mastodon:before{content:"\f4f6"}.fa-maxcdn:before{content:"\f136"}.fa-mdb:before{content:"\f8ca"}.fa-medal:before{content:"\f5a2"}.fa-medapps:before{content:"\f3c6"}.fa-medium:before{content:"\f23a"}.fa-medium-m:before{content:"\f3c7"}.fa-medkit:before{content:"\f0fa"}.fa-medrt:before{content:"\f3c8"}.fa-meetup:before{content:"\f2e0"}.fa-megaport:before{content:"\f5a3"}.fa-meh:before{content:"\f11a"}.fa-meh-blank:before{content:"\f5a4"}.fa-meh-rolling-eyes:before{content:"\f5a5"}.fa-memory:before{content:"\f538"}.fa-mendeley:before{content:"\f7b3"}.fa-menorah:before{content:"\f676"}.fa-mercury:before{content:"\f223"}.fa-meteor:before{content:"\f753"}.fa-microblog:before{content:"\f91a"}.fa-microchip:before{content:"\f2db"}.fa-microphone:before{content:"\f130"}.fa-microphone-alt:before{content:"\f3c9"}.fa-microphone-alt-slash:before{content:"\f539"}.fa-microphone-slash:before{content:"\f131"}.fa-microscope:before{content:"\f610"}.fa-microsoft:before{content:"\f3ca"}.fa-minus:before{content:"\f068"}.fa-minus-circle:before{content:"\f056"}.fa-minus-square:before{content:"\f146"}.fa-mitten:before{content:"\f7b5"}.fa-mix:before{content:"\f3cb"}.fa-mixcloud:before{content:"\f289"}.fa-mixer:before{content:"\f956"}.fa-mizuni:before{content:"\f3cc"}.fa-mobile:before{
content:"\f10b"}.fa-mobile-alt:before{content:"\f3cd"}.fa-modx:before{content:"\f285"}.fa-monero:before{content:"\f3d0"}.fa-money-bill:before{content:"\f0d6"}.fa-money-bill-alt:before{content:"\f3d1"}.fa-money-bill-wave:before{content:"\f53a"}.fa-money-bill-wave-alt:before{content:"\f53b"}.fa-money-check:before{content:"\f53c"}.fa-money-check-alt:before{content:"\f53d"}.fa-monument:before{content:"\f5a6"}.fa-moon:before{content:"\f186"}.fa-mortar-pestle:before{content:"\f5a7"}.fa-mosque:before{content:"\f678"}.fa-motorcycle:before{content:"\f21c"}.fa-mountain:before{content:"\f6fc"}.fa-mouse:before{content:"\f8cc"}.fa-mouse-pointer:before{content:"\f245"}.fa-mug-hot:before{content:"\f7b6"}.fa-music:before{content:"\f001"}.fa-napster:before{content:"\f3d2"}.fa-neos:before{content:"\f612"}.fa-network-wired:before{content:"\f6ff"}.fa-neuter:before{content:"\f22c"}.fa-newspaper:before{content:"\f1ea"}.fa-nimblr:before{content:"\f5a8"}.fa-node:before{content:"\f419"}.fa-node-js:before{content:"\f3d3"}.fa-not-equal:before{content:"\f53e"}.fa-notes-medical:before{content:"\f481"}.fa-npm:before{content:"\f3d4"}.fa-ns8:before{content:"\f3d5"}.fa-nutritionix:before{content:"\f3d6"}.fa-object-group:before{content:"\f247"}.fa-object-ungroup:before{content:"\f248"}.fa-odnoklassniki:before{content:"\f263"}.fa-odnoklassniki-square:before{content:"\f264"}.fa-oil-can:before{content:"\f613"}.fa-old-republic:before{content:"\f510"}.fa-om:before{content:"\f679"}.fa-opencart:before{content:"\f23d"}.fa-openid:before{content:"\f19b"}.fa-opera:before{content:"\f26a"}.fa-optin-monster:before{content:"\f23c"}.fa-orcid:before{content:"\f8d2"}.fa-osi:before{content:"\f41a"}.fa-otter:before{content:"\f700"}.fa-outdent:before{content:"\f03b"}.fa-page4:before{content:"\f3d7"}.fa-pagelines:before{content:"\f18c"}.fa-pager:before{content:"\f815"}.fa-paint-brush:before{content:"\f1fc"}.fa-paint-roller:before{content:"\f5aa"}.fa-palette:before{content:"\f53f"}.fa-palfed:before{content:"\f3d8"}.fa-pal
let:before{content:"\f482"}.fa-paper-plane:before{content:"\f1d8"}.fa-paperclip:before{content:"\f0c6"}.fa-parachute-box:before{content:"\f4cd"}.fa-paragraph:before{content:"\f1dd"}.fa-parking:before{content:"\f540"}.fa-passport:before{content:"\f5ab"}.fa-pastafarianism:before{content:"\f67b"}.fa-paste:before{content:"\f0ea"}.fa-patreon:before{content:"\f3d9"}.fa-pause:before{content:"\f04c"}.fa-pause-circle:before{content:"\f28b"}.fa-paw:before{content:"\f1b0"}.fa-paypal:before{content:"\f1ed"}.fa-peace:before{content:"\f67c"}.fa-pen:before{content:"\f304"}.fa-pen-alt:before{content:"\f305"}.fa-pen-fancy:before{content:"\f5ac"}.fa-pen-nib:before{content:"\f5ad"}.fa-pen-square:before{content:"\f14b"}.fa-pencil-alt:before{content:"\f303"}.fa-pencil-ruler:before{content:"\f5ae"}.fa-penny-arcade:before{content:"\f704"}.fa-people-arrows:before{content:"\f968"}.fa-people-carry:before{content:"\f4ce"}.fa-pepper-hot:before{content:"\f816"}.fa-percent:before{content:"\f295"}.fa-percentage:before{content:"\f541"}.fa-periscope:before{content:"\f3da"}.fa-person-booth:before{content:"\f756"}.fa-phabricator:before{content:"\f3db"}.fa-phoenix-framework:before{content:"\f3dc"}.fa-phoenix-squadron:before{content:"\f511"}.fa-phone:before{content:"\f095"}.fa-phone-alt:before{content:"\f879"}.fa-phone-slash:before{content:"\f3dd"}.fa-phone-square:before{content:"\f098"}.fa-phone-square-alt:before{content:"\f87b"}.fa-phone-volume:before{content:"\f2a0"}.fa-photo-video:before{content:"\f87c"}.fa-php:before{content:"\f457"}.fa-pied-piper:before{content:"\f2ae"}.fa-pied-piper-alt:before{content:"\f1a8"}.fa-pied-piper-hat:before{content:"\f4e5"}.fa-pied-piper-pp:before{content:"\f1a7"}.fa-pied-piper-square:before{content:"\f91e"}.fa-piggy-bank:before{content:"\f4d3"}.fa-pills:before{content:"\f484"}.fa-pinterest:before{content:"\f0d2"}.fa-pinterest-p:before{content:"\f231"}.fa-pinterest-square:before{content:"\f0d3"}.fa-pizza-slice:before{content:"\f818"}.fa-place-of-worship:before{content
:"\f67f"}.fa-plane:before{content:"\f072"}.fa-plane-arrival:before{content:"\f5af"}.fa-plane-departure:before{content:"\f5b0"}.fa-plane-slash:before{content:"\f969"}.fa-play:before{content:"\f04b"}.fa-play-circle:before{content:"\f144"}.fa-playstation:before{content:"\f3df"}.fa-plug:before{content:"\f1e6"}.fa-plus:before{content:"\f067"}.fa-plus-circle:before{content:"\f055"}.fa-plus-square:before{content:"\f0fe"}.fa-podcast:before{content:"\f2ce"}.fa-poll:before{content:"\f681"}.fa-poll-h:before{content:"\f682"}.fa-poo:before{content:"\f2fe"}.fa-poo-storm:before{content:"\f75a"}.fa-poop:before{content:"\f619"}.fa-portrait:before{content:"\f3e0"}.fa-pound-sign:before{content:"\f154"}.fa-power-off:before{content:"\f011"}.fa-pray:before{content:"\f683"}.fa-praying-hands:before{content:"\f684"}.fa-prescription:before{content:"\f5b1"}.fa-prescription-bottle:before{content:"\f485"}.fa-prescription-bottle-alt:before{content:"\f486"}.fa-print:before{content:"\f02f"}.fa-procedures:before{content:"\f487"}.fa-product-hunt:before{content:"\f288"}.fa-project-diagram:before{content:"\f542"}.fa-pump-medical:before{content:"\f96a"}.fa-pump-soap:before{content:"\f96b"}.fa-pushed:before{content:"\f3e1"}.fa-puzzle-piece:before{content:"\f12e"}.fa-python:before{content:"\f3e2"}.fa-qq:before{content:"\f1d6"}.fa-qrcode:before{content:"\f029"}.fa-question:before{content:"\f128"}.fa-question-circle:before{content:"\f059"}.fa-quidditch:before{content:"\f458"}.fa-quinscape:before{content:"\f459"}.fa-quora:before{content:"\f2c4"}.fa-quote-left:before{content:"\f10d"}.fa-quote-right:before{content:"\f10e"}.fa-quran:before{content:"\f687"}.fa-r-project:before{content:"\f4f7"}.fa-radiation:before{content:"\f7b9"}.fa-radiation-alt:before{content:"\f7ba"}.fa-rainbow:before{content:"\f75b"}.fa-random:before{content:"\f074"}.fa-raspberry-pi:before{content:"\f7bb"}.fa-ravelry:before{content:"\f2d9"}.fa-react:before{content:"\f41b"}.fa-reacteurope:before{content:"\f75d"}.fa-readme:before{content:"\f4
d5"}.fa-rebel:before{content:"\f1d0"}.fa-receipt:before{content:"\f543"}.fa-record-vinyl:before{content:"\f8d9"}.fa-recycle:before{content:"\f1b8"}.fa-red-river:before{content:"\f3e3"}.fa-reddit:before{content:"\f1a1"}.fa-reddit-alien:before{content:"\f281"}.fa-reddit-square:before{content:"\f1a2"}.fa-redhat:before{content:"\f7bc"}.fa-redo:before{content:"\f01e"}.fa-redo-alt:before{content:"\f2f9"}.fa-registered:before{content:"\f25d"}.fa-remove-format:before{content:"\f87d"}.fa-renren:before{content:"\f18b"}.fa-reply:before{content:"\f3e5"}.fa-reply-all:before{content:"\f122"}.fa-replyd:before{content:"\f3e6"}.fa-republican:before{content:"\f75e"}.fa-researchgate:before{content:"\f4f8"}.fa-resolving:before{content:"\f3e7"}.fa-restroom:before{content:"\f7bd"}.fa-retweet:before{content:"\f079"}.fa-rev:before{content:"\f5b2"}.fa-ribbon:before{content:"\f4d6"}.fa-ring:before{content:"\f70b"}.fa-road:before{content:"\f018"}.fa-robot:before{content:"\f544"}.fa-rocket:before{content:"\f135"}.fa-rocketchat:before{content:"\f3e8"}.fa-rockrms:before{content:"\f3e9"}.fa-route:before{content:"\f4d7"}.fa-rss:before{content:"\f09e"}.fa-rss-square:before{content:"\f143"}.fa-ruble-sign:before{content:"\f158"}.fa-ruler:before{content:"\f545"}.fa-ruler-combined:before{content:"\f546"}.fa-ruler-horizontal:before{content:"\f547"}.fa-ruler-vertical:before{content:"\f548"}.fa-running:before{content:"\f70c"}.fa-rupee-sign:before{content:"\f156"}.fa-sad-cry:before{content:"\f5b3"}.fa-sad-tear:before{content:"\f5b4"}.fa-safari:before{content:"\f267"}.fa-salesforce:before{content:"\f83b"}.fa-sass:before{content:"\f41e"}.fa-satellite:before{content:"\f7bf"}.fa-satellite-dish:before{content:"\f7c0"}.fa-save:before{content:"\f0c7"}.fa-schlix:before{content:"\f3ea"}.fa-school:before{content:"\f549"}.fa-screwdriver:before{content:"\f54a"}.fa-scribd:before{content:"\f28a"}.fa-scroll:before{content:"\f70e"}.fa-sd-card:before{content:"\f7c2"}.fa-search:before{content:"\f002"}.fa-search-dollar:befor
e{content:"\f688"}.fa-search-location:before{content:"\f689"}.fa-search-minus:before{content:"\f010"}.fa-search-plus:before{content:"\f00e"}.fa-searchengin:before{content:"\f3eb"}.fa-seedling:before{content:"\f4d8"}.fa-sellcast:before{content:"\f2da"}.fa-sellsy:before{content:"\f213"}.fa-server:before{content:"\f233"}.fa-servicestack:before{content:"\f3ec"}.fa-shapes:before{content:"\f61f"}.fa-share:before{content:"\f064"}.fa-share-alt:before{content:"\f1e0"}.fa-share-alt-square:before{content:"\f1e1"}.fa-share-square:before{content:"\f14d"}.fa-shekel-sign:before{content:"\f20b"}.fa-shield-alt:before{content:"\f3ed"}.fa-shield-virus:before{content:"\f96c"}.fa-ship:before{content:"\f21a"}.fa-shipping-fast:before{content:"\f48b"}.fa-shirtsinbulk:before{content:"\f214"}.fa-shoe-prints:before{content:"\f54b"}.fa-shopify:before{content:"\f957"}.fa-shopping-bag:before{content:"\f290"}.fa-shopping-basket:before{content:"\f291"}.fa-shopping-cart:before{content:"\f07a"}.fa-shopware:before{content:"\f5b5"}.fa-shower:before{content:"\f2cc"}.fa-shuttle-van:before{content:"\f5b6"}.fa-sign:before{content:"\f4d9"}.fa-sign-in-alt:before{content:"\f2f6"}.fa-sign-language:before{content:"\f2a7"}.fa-sign-out-alt:before{content:"\f2f5"}.fa-signal:before{content:"\f012"}.fa-signature:before{content:"\f5b7"}.fa-sim-card:before{content:"\f7c4"}.fa-simplybuilt:before{content:"\f215"}.fa-sistrix:before{content:"\f3ee"}.fa-sitemap:before{content:"\f0e8"}.fa-sith:before{content:"\f512"}.fa-skating:before{content:"\f7c5"}.fa-sketch:before{content:"\f7c6"}.fa-skiing:before{content:"\f7c9"}.fa-skiing-nordic:before{content:"\f7ca"}.fa-skull:before{content:"\f54c"}.fa-skull-crossbones:before{content:"\f714"}.fa-skyatlas:before{content:"\f216"}.fa-skype:before{content:"\f17e"}.fa-slack:before{content:"\f198"}.fa-slack-hash:before{content:"\f3ef"}.fa-slash:before{content:"\f715"}.fa-sleigh:before{content:"\f7cc"}.fa-sliders-h:before{content:"\f1de"}.fa-slideshare:before{content:"\f1e7"}.fa-smile:bef
ore{content:"\f118"}.fa-smile-beam:before{content:"\f5b8"}.fa-smile-wink:before{content:"\f4da"}.fa-smog:before{content:"\f75f"}.fa-smoking:before{content:"\f48d"}.fa-smoking-ban:before{content:"\f54d"}.fa-sms:before{content:"\f7cd"}.fa-snapchat:before{content:"\f2ab"}.fa-snapchat-ghost:before{content:"\f2ac"}.fa-snapchat-square:before{content:"\f2ad"}.fa-snowboarding:before{content:"\f7ce"}.fa-snowflake:before{content:"\f2dc"}.fa-snowman:before{content:"\f7d0"}.fa-snowplow:before{content:"\f7d2"}.fa-soap:before{content:"\f96e"}.fa-socks:before{content:"\f696"}.fa-solar-panel:before{content:"\f5ba"}.fa-sort:before{content:"\f0dc"}.fa-sort-alpha-down:before{content:"\f15d"}.fa-sort-alpha-down-alt:before{content:"\f881"}.fa-sort-alpha-up:before{content:"\f15e"}.fa-sort-alpha-up-alt:before{content:"\f882"}.fa-sort-amount-down:before{content:"\f160"}.fa-sort-amount-down-alt:before{content:"\f884"}.fa-sort-amount-up:before{content:"\f161"}.fa-sort-amount-up-alt:before{content:"\f885"}.fa-sort-down:before{content:"\f0dd"}.fa-sort-numeric-down:before{content:"\f162"}.fa-sort-numeric-down-alt:before{content:"\f886"}.fa-sort-numeric-up:before{content:"\f163"}.fa-sort-numeric-up-alt:before{content:"\f887"}.fa-sort-up:before{content:"\f0de"}.fa-soundcloud:before{content:"\f1be"}.fa-sourcetree:before{content:"\f7d3"}.fa-spa:before{content:"\f5bb"}.fa-space-shuttle:before{content:"\f197"}.fa-speakap:before{content:"\f3f3"}.fa-speaker-deck:before{content:"\f83c"}.fa-spell-check:before{content:"\f891"}.fa-spider:before{content:"\f717"}.fa-spinner:before{content:"\f110"}.fa-splotch:before{content:"\f5bc"}.fa-spotify:before{content:"\f1bc"}.fa-spray-can:before{content:"\f5bd"}.fa-square:before{content:"\f0c8"}.fa-square-full:before{content:"\f45c"}.fa-square-root-alt:before{content:"\f698"}.fa-squarespace:before{content:"\f5be"}.fa-stack-exchange:before{content:"\f18d"}.fa-stack-overflow:before{content:"\f16c"}.fa-stackpath:before{content:"\f842"}.fa-stamp:before{content:"\f5bf"}.fa
-star:before{content:"\f005"}.fa-star-and-crescent:before{content:"\f699"}.fa-star-half:before{content:"\f089"}.fa-star-half-alt:before{content:"\f5c0"}.fa-star-of-david:before{content:"\f69a"}.fa-star-of-life:before{content:"\f621"}.fa-staylinked:before{content:"\f3f5"}.fa-steam:before{content:"\f1b6"}.fa-steam-square:before{content:"\f1b7"}.fa-steam-symbol:before{content:"\f3f6"}.fa-step-backward:before{content:"\f048"}.fa-step-forward:before{content:"\f051"}.fa-stethoscope:before{content:"\f0f1"}.fa-sticker-mule:before{content:"\f3f7"}.fa-sticky-note:before{content:"\f249"}.fa-stop:before{content:"\f04d"}.fa-stop-circle:before{content:"\f28d"}.fa-stopwatch:before{content:"\f2f2"}.fa-stopwatch-20:before{content:"\f96f"}.fa-store:before{content:"\f54e"}.fa-store-alt:before{content:"\f54f"}.fa-store-alt-slash:before{content:"\f970"}.fa-store-slash:before{content:"\f971"}.fa-strava:before{content:"\f428"}.fa-stream:before{content:"\f550"}.fa-street-view:before{content:"\f21d"}.fa-strikethrough:before{content:"\f0cc"}.fa-stripe:before{content:"\f429"}.fa-stripe-s:before{content:"\f42a"}.fa-stroopwafel:before{content:"\f551"}.fa-studiovinari:before{content:"\f3f8"}.fa-stumbleupon:before{content:"\f1a4"}.fa-stumbleupon-circle:before{content:"\f1a3"}.fa-subscript:before{content:"\f12c"}.fa-subway:before{content:"\f239"}.fa-suitcase:before{content:"\f0f2"}.fa-suitcase-rolling:before{content:"\f5c1"}.fa-sun:before{content:"\f185"}.fa-superpowers:before{content:"\f2dd"}.fa-superscript:before{content:"\f12b"}.fa-supple:before{content:"\f3f9"}.fa-surprise:before{content:"\f5c2"}.fa-suse:before{content:"\f7d6"}.fa-swatchbook:before{content:"\f5c3"}.fa-swift:before{content:"\f8e1"}.fa-swimmer:before{content:"\f5c4"}.fa-swimming-pool:before{content:"\f5c5"}.fa-symfony:before{content:"\f83d"}.fa-synagogue:before{content:"\f69b"}.fa-sync:before{content:"\f021"}.fa-sync-alt:before{content:"\f2f1"}.fa-syringe:before{content:"\f48e"}.fa-table:before{content:"\f0ce"}.fa-table-tennis:b
efore{content:"\f45d"}.fa-tablet:before{content:"\f10a"}.fa-tablet-alt:before{content:"\f3fa"}.fa-tablets:before{content:"\f490"}.fa-tachometer-alt:before{content:"\f3fd"}.fa-tag:before{content:"\f02b"}.fa-tags:before{content:"\f02c"}.fa-tape:before{content:"\f4db"}.fa-tasks:before{content:"\f0ae"}.fa-taxi:before{content:"\f1ba"}.fa-teamspeak:before{content:"\f4f9"}.fa-teeth:before{content:"\f62e"}.fa-teeth-open:before{content:"\f62f"}.fa-telegram:before{content:"\f2c6"}.fa-telegram-plane:before{content:"\f3fe"}.fa-temperature-high:before{content:"\f769"}.fa-temperature-low:before{content:"\f76b"}.fa-tencent-weibo:before{content:"\f1d5"}.fa-tenge:before{content:"\f7d7"}.fa-terminal:before{content:"\f120"}.fa-text-height:before{content:"\f034"}.fa-text-width:before{content:"\f035"}.fa-th:before{content:"\f00a"}.fa-th-large:before{content:"\f009"}.fa-th-list:before{content:"\f00b"}.fa-the-red-yeti:before{content:"\f69d"}.fa-theater-masks:before{content:"\f630"}.fa-themeco:before{content:"\f5c6"}.fa-themeisle:before{content:"\f2b2"}.fa-thermometer:before{content:"\f491"}.fa-thermometer-empty:before{content:"\f2cb"}.fa-thermometer-full:before{content:"\f2c7"}.fa-thermometer-half:before{content:"\f2c9"}.fa-thermometer-quarter:before{content:"\f2ca"}.fa-thermometer-three-quarters:before{content:"\f2c8"}.fa-think-peaks:before{content:"\f731"}.fa-thumbs-down:before{content:"\f165"}.fa-thumbs-up:before{content:"\f164"}.fa-thumbtack:before{content:"\f08d"}.fa-ticket-alt:before{content:"\f3ff"}.fa-times:before{content:"\f00d"}.fa-times-circle:before{content:"\f057"}.fa-tint:before{content:"\f043"}.fa-tint-slash:before{content:"\f5c7"}.fa-tired:before{content:"\f5c8"}.fa-toggle-off:before{content:"\f204"}.fa-toggle-on:before{content:"\f205"}.fa-toilet:before{content:"\f7d8"}.fa-toilet-paper:before{content:"\f71e"}.fa-toilet-paper-slash:before{content:"\f972"}.fa-toolbox:before{content:"\f552"}.fa-tools:before{content:"\f7d9"}.fa-tooth:before{content:"\f5c9"}.fa-torah:before{con
tent:"\f6a0"}.fa-torii-gate:before{content:"\f6a1"}.fa-tractor:before{content:"\f722"}.fa-trade-federation:before{content:"\f513"}.fa-trademark:before{content:"\f25c"}.fa-traffic-light:before{content:"\f637"}.fa-trailer:before{content:"\f941"}.fa-train:before{content:"\f238"}.fa-tram:before{content:"\f7da"}.fa-transgender:before{content:"\f224"}.fa-transgender-alt:before{content:"\f225"}.fa-trash:before{content:"\f1f8"}.fa-trash-alt:before{content:"\f2ed"}.fa-trash-restore:before{content:"\f829"}.fa-trash-restore-alt:before{content:"\f82a"}.fa-tree:before{content:"\f1bb"}.fa-trello:before{content:"\f181"}.fa-tripadvisor:before{content:"\f262"}.fa-trophy:before{content:"\f091"}.fa-truck:before{content:"\f0d1"}.fa-truck-loading:before{content:"\f4de"}.fa-truck-monster:before{content:"\f63b"}.fa-truck-moving:before{content:"\f4df"}.fa-truck-pickup:before{content:"\f63c"}.fa-tshirt:before{content:"\f553"}.fa-tty:before{content:"\f1e4"}.fa-tumblr:before{content:"\f173"}.fa-tumblr-square:before{content:"\f174"}.fa-tv:before{content:"\f26c"}.fa-twitch:before{content:"\f1e8"}.fa-twitter:before{content:"\f099"}.fa-twitter-square:before{content:"\f081"}.fa-typo3:before{content:"\f42b"}.fa-uber:before{content:"\f402"}.fa-ubuntu:before{content:"\f7df"}.fa-uikit:before{content:"\f403"}.fa-umbraco:before{content:"\f8e8"}.fa-umbrella:before{content:"\f0e9"}.fa-umbrella-beach:before{content:"\f5ca"}.fa-underline:before{content:"\f0cd"}.fa-undo:before{content:"\f0e2"}.fa-undo-alt:before{content:"\f2ea"}.fa-uniregistry:before{content:"\f404"}.fa-unity:before{content:"\f949"}.fa-universal-access:before{content:"\f29a"}.fa-university:before{content:"\f19c"}.fa-unlink:before{content:"\f127"}.fa-unlock:before{content:"\f09c"}.fa-unlock-alt:before{content:"\f13e"}.fa-untappd:before{content:"\f405"}.fa-upload:before{content:"\f093"}.fa-ups:before{content:"\f7e0"}.fa-usb:before{content:"\f287"}.fa-user:before{content:"\f007"}.fa-user-alt:before{content:"\f406"}.fa-user-alt-slash:before{cont
ent:"\f4fa"}.fa-user-astronaut:before{content:"\f4fb"}.fa-user-check:before{content:"\f4fc"}.fa-user-circle:before{content:"\f2bd"}.fa-user-clock:before{content:"\f4fd"}.fa-user-cog:before{content:"\f4fe"}.fa-user-edit:before{content:"\f4ff"}.fa-user-friends:before{content:"\f500"}.fa-user-graduate:before{content:"\f501"}.fa-user-injured:before{content:"\f728"}.fa-user-lock:before{content:"\f502"}.fa-user-md:before{content:"\f0f0"}.fa-user-minus:before{content:"\f503"}.fa-user-ninja:before{content:"\f504"}.fa-user-nurse:before{content:"\f82f"}.fa-user-plus:before{content:"\f234"}.fa-user-secret:before{content:"\f21b"}.fa-user-shield:before{content:"\f505"}.fa-user-slash:before{content:"\f506"}.fa-user-tag:before{content:"\f507"}.fa-user-tie:before{content:"\f508"}.fa-user-times:before{content:"\f235"}.fa-users:before{content:"\f0c0"}.fa-users-cog:before{content:"\f509"}.fa-usps:before{content:"\f7e1"}.fa-ussunnah:before{content:"\f407"}.fa-utensil-spoon:before{content:"\f2e5"}.fa-utensils:before{content:"\f2e7"}.fa-vaadin:before{content:"\f408"}.fa-vector-square:before{content:"\f5cb"}.fa-venus:before{content:"\f221"}.fa-venus-double:before{content:"\f226"}.fa-venus-mars:before{content:"\f228"}.fa-viacoin:before{content:"\f237"}.fa-viadeo:before{content:"\f2a9"}.fa-viadeo-square:before{content:"\f2aa"}.fa-vial:before{content:"\f492"}.fa-vials:before{content:"\f493"}.fa-viber:before{content:"\f409"}.fa-video:before{content:"\f03d"}.fa-video-slash:before{content:"\f4e2"}.fa-vihara:before{content:"\f6a7"}.fa-vimeo:before{content:"\f40a"}.fa-vimeo-square:before{content:"\f194"}.fa-vimeo-v:before{content:"\f27d"}.fa-vine:before{content:"\f1ca"}.fa-virus:before{content:"\f974"}.fa-virus-slash:before{content:"\f975"}.fa-viruses:before{content:"\f976"}.fa-vk:before{content:"\f189"}.fa-vnv:before{content:"\f40b"}.fa-voicemail:before{content:"\f897"}.fa-volleyball-ball:before{content:"\f45f"}.fa-volume-down:before{content:"\f027"}.fa-volume-mute:before{content:"\f6a9"}.fa-vol
ume-off:before{content:"\f026"}.fa-volume-up:before{content:"\f028"}.fa-vote-yea:before{content:"\f772"}.fa-vr-cardboard:before{content:"\f729"}.fa-vuejs:before{content:"\f41f"}.fa-walking:before{content:"\f554"}.fa-wallet:before{content:"\f555"}.fa-warehouse:before{content:"\f494"}.fa-water:before{content:"\f773"}.fa-wave-square:before{content:"\f83e"}.fa-waze:before{content:"\f83f"}.fa-weebly:before{content:"\f5cc"}.fa-weibo:before{content:"\f18a"}.fa-weight:before{content:"\f496"}.fa-weight-hanging:before{content:"\f5cd"}.fa-weixin:before{content:"\f1d7"}.fa-whatsapp:before{content:"\f232"}.fa-whatsapp-square:before{content:"\f40c"}.fa-wheelchair:before{content:"\f193"}.fa-whmcs:before{content:"\f40d"}.fa-wifi:before{content:"\f1eb"}.fa-wikipedia-w:before{content:"\f266"}.fa-wind:before{content:"\f72e"}.fa-window-close:before{content:"\f410"}.fa-window-maximize:before{content:"\f2d0"}.fa-window-minimize:before{content:"\f2d1"}.fa-window-restore:before{content:"\f2d2"}.fa-windows:before{content:"\f17a"}.fa-wine-bottle:before{content:"\f72f"}.fa-wine-glass:before{content:"\f4e3"}.fa-wine-glass-alt:before{content:"\f5ce"}.fa-wix:before{content:"\f5cf"}.fa-wizards-of-the-coast:before{content:"\f730"}.fa-wolf-pack-battalion:before{content:"\f514"}.fa-won-sign:before{content:"\f159"}.fa-wordpress:before{content:"\f19a"}.fa-wordpress-simple:before{content:"\f411"}.fa-wpbeginner:before{content:"\f297"}.fa-wpexplorer:before{content:"\f2de"}.fa-wpforms:before{content:"\f298"}.fa-wpressr:before{content:"\f3e4"}.fa-wrench:before{content:"\f0ad"}.fa-x-ray:before{content:"\f497"}.fa-xbox:before{content:"\f412"}.fa-xing:before{content:"\f168"}.fa-xing-square:before{content:"\f169"}.fa-y-combinator:before{content:"\f23b"}.fa-yahoo:before{content:"\f19e"}.fa-yammer:before{content:"\f840"}.fa-yandex:before{content:"\f413"}.fa-yandex-international:before{content:"\f414"}.fa-yarn:before{content:"\f7e3"}.fa-yelp:before{content:"\f1e9"}.fa-yen-sign:before{content:"\f157"}.fa-yin-yang:
before{content:"\f6ad"}.fa-yoast:before{content:"\f2b1"}.fa-youtube:before{content:"\f167"}.fa-youtube-square:before{content:"\f431"}.fa-zhihu:before{content:"\f63f"}.sr-only{border:0;clip:rect(0,0,0,0);height:1px;margin:-1px;overflow:hidden;padding:0;position:absolute;width:1px}.sr-only-focusable:active,.sr-only-focusable:focus{clip:auto;height:auto;margin:0;overflow:visible;position:static;width:auto}@font-face{font-family:"Font Awesome 5 Brands";font-style:normal;font-weight:400;font-display:block;src:url(../webfonts/fa-brands-400.eot);src:url(../webfonts/fa-brands-400.eot?#iefix) format("embedded-opentype"),url(../webfonts/fa-brands-400.woff2) format("woff2"),url(../webfonts/fa-brands-400.woff) format("woff"),url(../webfonts/fa-brands-400.ttf) format("truetype"),url(../webfonts/fa-brands-400.svg#fontawesome) format("svg")}.fab{font-family:"Font Awesome 5 Brands"}@font-face{font-family:"Font Awesome 5 Free";font-style:normal;font-weight:400;font-display:block;src:url(../webfonts/fa-regular-400.eot);src:url(../webfonts/fa-regular-400.eot?#iefix) format("embedded-opentype"),url(../webfonts/fa-regular-400.woff2) format("woff2"),url(../webfonts/fa-regular-400.woff) format("woff"),url(../webfonts/fa-regular-400.ttf) format("truetype"),url(../webfonts/fa-regular-400.svg#fontawesome) format("svg")}.fab,.far{font-weight:400}@font-face{font-family:"Font Awesome 5 Free";font-style:normal;font-weight:900;font-display:block;src:url(../webfonts/fa-solid-900.eot);src:url(../webfonts/fa-solid-900.eot?#iefix) format("embedded-opentype"),url(../webfonts/fa-solid-900.woff2) format("woff2"),url(../webfonts/fa-solid-900.woff) format("woff"),url(../webfonts/fa-solid-900.ttf) format("truetype"),url(../webfonts/fa-solid-900.svg#fontawesome) format("svg")}.fa,.far,.fas{font-family:"Font Awesome 5 Free"}.fa,.fas{font-weight:900}
\ No newline at end of file
diff --git a/docs/build/html/_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.eot b/docs/build/html/_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.eot
new file mode 100644
index 00000000..a1bc094a
Binary files /dev/null and b/docs/build/html/_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.eot differ
diff --git a/docs/build/html/_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.svg b/docs/build/html/_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.svg
new file mode 100644
index 00000000..46ad237a
--- /dev/null
+++ b/docs/build/html/_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.svg
@@ -0,0 +1,3570 @@
+
+
+
+
diff --git a/docs/build/html/_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.ttf b/docs/build/html/_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.ttf
new file mode 100644
index 00000000..948a2a6c
Binary files /dev/null and b/docs/build/html/_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.ttf differ
diff --git a/docs/build/html/_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.woff b/docs/build/html/_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.woff
new file mode 100644
index 00000000..2a89d521
Binary files /dev/null and b/docs/build/html/_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.woff differ
diff --git a/docs/build/html/_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.woff2 b/docs/build/html/_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.woff2
new file mode 100644
index 00000000..141a90a9
Binary files /dev/null and b/docs/build/html/_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.woff2 differ
diff --git a/docs/build/html/_static/vendor/fontawesome/5.13.0/webfonts/fa-regular-400.eot b/docs/build/html/_static/vendor/fontawesome/5.13.0/webfonts/fa-regular-400.eot
new file mode 100644
index 00000000..38cf2517
Binary files /dev/null and b/docs/build/html/_static/vendor/fontawesome/5.13.0/webfonts/fa-regular-400.eot differ
diff --git a/docs/build/html/_static/vendor/fontawesome/5.13.0/webfonts/fa-regular-400.svg b/docs/build/html/_static/vendor/fontawesome/5.13.0/webfonts/fa-regular-400.svg
new file mode 100644
index 00000000..48634a9a
--- /dev/null
+++ b/docs/build/html/_static/vendor/fontawesome/5.13.0/webfonts/fa-regular-400.svg
@@ -0,0 +1,803 @@
+
+
+
+
diff --git a/docs/build/html/_static/vendor/fontawesome/5.13.0/webfonts/fa-regular-400.ttf b/docs/build/html/_static/vendor/fontawesome/5.13.0/webfonts/fa-regular-400.ttf
new file mode 100644
index 00000000..abe99e20
Binary files /dev/null and b/docs/build/html/_static/vendor/fontawesome/5.13.0/webfonts/fa-regular-400.ttf differ
diff --git a/docs/build/html/_static/vendor/fontawesome/5.13.0/webfonts/fa-regular-400.woff b/docs/build/html/_static/vendor/fontawesome/5.13.0/webfonts/fa-regular-400.woff
new file mode 100644
index 00000000..24de566a
Binary files /dev/null and b/docs/build/html/_static/vendor/fontawesome/5.13.0/webfonts/fa-regular-400.woff differ
diff --git a/docs/build/html/_static/vendor/fontawesome/5.13.0/webfonts/fa-regular-400.woff2 b/docs/build/html/_static/vendor/fontawesome/5.13.0/webfonts/fa-regular-400.woff2
new file mode 100644
index 00000000..7e0118e5
Binary files /dev/null and b/docs/build/html/_static/vendor/fontawesome/5.13.0/webfonts/fa-regular-400.woff2 differ
diff --git a/docs/build/html/_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.eot b/docs/build/html/_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.eot
new file mode 100644
index 00000000..d3b77c22
Binary files /dev/null and b/docs/build/html/_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.eot differ
diff --git a/docs/build/html/_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.svg b/docs/build/html/_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.svg
new file mode 100644
index 00000000..7742838b
--- /dev/null
+++ b/docs/build/html/_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.svg
@@ -0,0 +1,4938 @@
+
+
+
+
diff --git a/docs/build/html/_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.ttf b/docs/build/html/_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.ttf
new file mode 100644
index 00000000..5b979039
Binary files /dev/null and b/docs/build/html/_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.ttf differ
diff --git a/docs/build/html/_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.woff b/docs/build/html/_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.woff
new file mode 100644
index 00000000..beec7917
Binary files /dev/null and b/docs/build/html/_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.woff differ
diff --git a/docs/build/html/_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.woff2 b/docs/build/html/_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.woff2
new file mode 100644
index 00000000..978a681a
Binary files /dev/null and b/docs/build/html/_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.woff2 differ
diff --git a/docs/build/html/_static/webpack-macros.html b/docs/build/html/_static/webpack-macros.html
new file mode 100644
index 00000000..b51c16ef
--- /dev/null
+++ b/docs/build/html/_static/webpack-macros.html
@@ -0,0 +1,29 @@
+
+{# Load FontAwesome icons #}
+{% macro head_pre_icons() %}
+
+
+
+{% endmacro %}
+
+{% macro head_pre_assets() %}
+
+
+
+{% endmacro %}
+
+{% macro head_js_preload() %}
+
+
+{% endmacro %}
+
+{% macro body_post() %}
+
+
+{% endmacro %}
\ No newline at end of file
diff --git a/docs/build/html/api.html b/docs/build/html/api.html
new file mode 100644
index 00000000..4a27eb00
--- /dev/null
+++ b/docs/build/html/api.html
@@ -0,0 +1,480 @@
+
+
+
+
+
+
+
+
+ API — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/cli/cli.html b/docs/build/html/cli/cli.html
new file mode 100644
index 00000000..0a647f6b
--- /dev/null
+++ b/docs/build/html/cli/cli.html
@@ -0,0 +1,529 @@
+
+
+
+
+
+
+
+
+ Command Line Interface — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Command Line Interface
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+Command Line Interface
+The command line interface is a simple wrapper around the library.
+It provides two main functionalities:
+
+download - Downloads samples of either Domain Record or HTML from common crawl indexes
+extract - Downloads an HTML from Domain Record and extracts the content. It can also directly take the HTML and extract the data.
+
+Both functionalities are invoked using `cmon`
followed by the functionality and the required arguments.
+
+Examples
+# Download first 1000 domain records for example.com
+cmon download --match_type=domain --limit=1000 example.com dr_output record
+
+# Download first 100 htmls for example.com
+cmon download --match_type=domain --limit=100 example.com html_output html
+
+# Take the domain records downloaded using the first command and extract them using your extractors
+cmon extract config.json extracted_output dr_output/*/*.jsonl record
+
+# Take the htmls downloaded using the second command and extract them using your extractors
+cmon extract config.json extracted_output html_output/*/*.html html
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/cli/download.html b/docs/build/html/cli/download.html
new file mode 100644
index 00000000..015e4388
--- /dev/null
+++ b/docs/build/html/cli/download.html
@@ -0,0 +1,605 @@
+
+
+
+
+
+
+
+
+ Command Line Download — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Command Line Download
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+Command Line Download
+The download mode of the `cmon`
command line tool serves to query and download from CommonCrawl indexes.
+The following arguments are needed in this order:
+
+Positional arguments
+
+url - URL to query.
+output - Path to output directory.
+{record,html} - Download mode:
+
+
+
+In html mode, the output directory will contain .html files, one
+for each found URL. In record mode, the output directory will contain
+`.jsonl`
files, each containing multiple domain records in JSON format.
+
+
+Options
+
+- --limit LIMIT
+Max number of URLs to download.
+
+- --since SINCE
+Start date in ISO format (e.g., 2020-01-01).
+
+- --to TO
+End date in ISO format (e.g., 2020-01-01).
+
+- --cc_server CC_SERVER
+Common Crawl indexes to query. Must provide the whole URL (e.g., https://index.commoncrawl.org/CC-MAIN-2023-14-index).
+
+- --max_retry MAX_RETRY
+Max number of retries for a request. Increase this number when requests are failing.
+
+- --sleep_step SLEEP_STEP
+Number of additional seconds to add to the sleep time between each failed download attempt. Increase this number if the server tells you to slow down.
+
+- --match_type MATCH_TYPE
+One of exact, prefix, host, domain
+Match type for the URL. Refer to cdx-api for more information.
+
+- --max_directory_size MAX_DIRECTORY_SIZE
+Max number of files per directory.
+
+- --filter_non_200
+Filter out responses with a non-200 status code.
+
+
+
+
+Record mode options
+
+- --max_crawls_per_file MAX_CRAWLS_PER_FILE
+Max number of domain records per file output
+
+
+
+
+Examples
+# Download first 1000 domain records for example.com
+cmon download --match_type=domain --limit=1000 example.com dr_output record
+
+# Download first 100 htmls for example.com
+cmon download --match_type=domain --limit=100 example.com html_output html
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/cli/extract.html b/docs/build/html/cli/extract.html
new file mode 100644
index 00000000..ece6a86a
--- /dev/null
+++ b/docs/build/html/cli/extract.html
@@ -0,0 +1,621 @@
+
+
+
+
+
+
+
+
+ Command line Extract — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Command line Extract
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+Command line Extract
+The extract mode of the `cmon`
command line tool serves to extract your downloaded files.
+The following arguments are needed in this order:
+
+Positional arguments
+
+config_path - Path to config file containing extraction rules.
+output_path - Path to output directory.
+files - Files to extract data from.
+{record,html} - Extraction mode:
+
+
+
+To create a config file, see Extractor config file.
+Both modes yield the same output format, which is a `.jsonl`
file containing the extracted data,
+one per line. For each file a new directory is created in the output directory, named after the
+file.
+The files created by the download mode can be used directly with the appropriate mode
+in the extraction. If you have an html file, you can use the html mode to extract it.
+If you have domain records that you got some other way (AWS Athena), please refer to Domain Record JSONL format,
+which describes how to create `.jsonl`
files from your domain records, which you can then
+use with the record mode.
+
+
+Optional arguments
+
+- --max_crawls_per_file MAX_CRAWLS_PER_FILE
+Max number of extractions per file output.
+
+- --max_directory_size MAX_DIRECTORY_SIZE
+Max number of extraction files per directory.
+
+- --n_proc N_PROC
+Number of processes to use for extraction. The parallelization is at the file level,
+thus for a single file it’s useless to use more than one process.
+
+
+
+
+Record arguments
+
+- --max_retry MAX_RETRY
+Max number of WARC download attempts.
+
+- --sleep_step SLEEP_STEP
+Number of additional seconds to add to the sleep time between each failed download attempt.
+
+
+
+
+Html arguments
+
+- --date DATE
+Date of extraction of HTML files in ISO format (e.g., 2021-01-01). The default is today.
+
+- --url URL
+URL from which the HTML files were downloaded. By default, it will try to infer from the file content.
+
+
+
+
+Examples
+# Take the domain records downloaded using the first command and extract them using your extractors
+cmon extract config.json extracted_output dr_output/*/*.jsonl record --max_retry 100 --sleep_step 10
+
+# Take the htmls downloaded using the second command and extract them using your extractors
+cmon extract config.json extracted_output html_output/*/*.html html --date 2021-01-01 --url https://www.example.com
+
+
+When you are going to build the extractors, you will appreciate that you can specify
+what the url of the html file is and what the date of the extraction is. This is because
+this information is used during the extractor routing.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/cli/index.html b/docs/build/html/cli/index.html
new file mode 100644
index 00000000..48d9223b
--- /dev/null
+++ b/docs/build/html/cli/index.html
@@ -0,0 +1,493 @@
+
+
+
+
+
+
+
+
+ Command Line Interface — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Command Line Interface
+
+
+
+
+
+
+
+
+Command Line Interface
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/extraction/config_file.html b/docs/build/html/extraction/config_file.html
new file mode 100644
index 00000000..44c562ea
--- /dev/null
+++ b/docs/build/html/extraction/config_file.html
@@ -0,0 +1,656 @@
+
+
+
+
+
+
+
+
+ Extractor config file — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Extractor config file
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/extraction/creating_extractor.html b/docs/build/html/extraction/creating_extractor.html
new file mode 100644
index 00000000..40534558
--- /dev/null
+++ b/docs/build/html/extraction/creating_extractor.html
@@ -0,0 +1,590 @@
+
+
+
+
+
+
+
+
+ Custom Extractor — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Custom Extractor
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/extraction/index.html b/docs/build/html/extraction/index.html
new file mode 100644
index 00000000..6783939b
--- /dev/null
+++ b/docs/build/html/extraction/index.html
@@ -0,0 +1,493 @@
+
+
+
+
+
+
+
+
+ Extraction — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/extraction/utils.html b/docs/build/html/extraction/utils.html
new file mode 100644
index 00000000..bf0bb475
--- /dev/null
+++ b/docs/build/html/extraction/utils.html
@@ -0,0 +1,537 @@
+
+
+
+
+
+
+
+
+ Extraction utils — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Extraction utils
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.aggregator.html b/docs/build/html/generated/cmoncrawl.aggregator.html
new file mode 100644
index 00000000..17ef94ff
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.aggregator.html
@@ -0,0 +1,484 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.aggregator — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.aggregator
+
+
+
+
+
+
+
+
+cmoncrawl.aggregator
+Modules
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.aggregator.index_query.IndexAggregator.__init__.html b/docs/build/html/generated/cmoncrawl.aggregator.index_query.IndexAggregator.__init__.html
new file mode 100644
index 00000000..d5becc93
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.aggregator.index_query.IndexAggregator.__init__.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.aggregator.index_query.IndexAggregator.__init__ — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.aggregator.index_query.IndexAggregator.__init__
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.aggregator.index_query.IndexAggregator.__init__
+
+-
+IndexAggregator.__init__(domains: List[str], cc_indexes_server: str = 'http://index.commoncrawl.org/collinfo.json', cc_servers: List[str] = [], since: datetime = datetime.datetime(1, 1, 1, 0, 0), to: datetime = datetime.datetime(9999, 12, 31, 23, 59, 59, 999999), limit: Optional[int] = None, max_retry: int = 5, prefetch_size: int = 3, sleep_step: int = 20) → None
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.aggregator.index_query.IndexAggregator.aclose.html b/docs/build/html/generated/cmoncrawl.aggregator.index_query.IndexAggregator.aclose.html
new file mode 100644
index 00000000..f32ac167
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.aggregator.index_query.IndexAggregator.aclose.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.aggregator.index_query.IndexAggregator.aclose — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.aggregator.index_query.IndexAggregator.aclose
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.aggregator.index_query.IndexAggregator.aclose
+
+-
+async IndexAggregator.aclose(exc_type: Optional[Type[BaseException]], exc_val: BaseException | None, exc_tb: Optional[TracebackType] = None) → IndexAggregator
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.aggregator.index_query.IndexAggregator.aopen.html b/docs/build/html/generated/cmoncrawl.aggregator.index_query.IndexAggregator.aopen.html
new file mode 100644
index 00000000..04a33e93
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.aggregator.index_query.IndexAggregator.aopen.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.aggregator.index_query.IndexAggregator.aopen — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.aggregator.index_query.IndexAggregator.aopen
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.aggregator.index_query.IndexAggregator.aopen
+
+-
+async IndexAggregator.aopen() → IndexAggregator
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.aggregator.index_query.IndexAggregator.get_all_CC_indexes.html b/docs/build/html/generated/cmoncrawl.aggregator.index_query.IndexAggregator.get_all_CC_indexes.html
new file mode 100644
index 00000000..7d6efd45
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.aggregator.index_query.IndexAggregator.get_all_CC_indexes.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.aggregator.index_query.IndexAggregator.get_all_CC_indexes — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.aggregator.index_query.IndexAggregator.get_all_CC_indexes
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.aggregator.index_query.IndexAggregator.get_all_CC_indexes
+
+-
+async static IndexAggregator.get_all_CC_indexes(client: ClientSession, cdx_server: str) → List[str]
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.aggregator.index_query.IndexAggregator.get_captured_responses.html b/docs/build/html/generated/cmoncrawl.aggregator.index_query.IndexAggregator.get_captured_responses.html
new file mode 100644
index 00000000..8c70d78d
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.aggregator.index_query.IndexAggregator.get_captured_responses.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.aggregator.index_query.IndexAggregator.get_captured_responses — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.aggregator.index_query.IndexAggregator.get_captured_responses
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.aggregator.index_query.IndexAggregator.get_captured_responses
+
+-
+async static IndexAggregator.get_captured_responses(client: ClientSession, cdx_server: str, domain: str, max_retry: int, sleep_step: int, page: int, since: datetime = datetime.datetime(1, 1, 1, 0, 0), to: datetime = datetime.datetime(9999, 12, 31, 23, 59, 59, 999999))
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.aggregator.index_query.IndexAggregator.get_number_of_pages.html b/docs/build/html/generated/cmoncrawl.aggregator.index_query.IndexAggregator.get_number_of_pages.html
new file mode 100644
index 00000000..be5e9c38
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.aggregator.index_query.IndexAggregator.get_number_of_pages.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.aggregator.index_query.IndexAggregator.get_number_of_pages — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.aggregator.index_query.IndexAggregator.get_number_of_pages
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.aggregator.index_query.IndexAggregator.get_number_of_pages
+
+-
+async static IndexAggregator.get_number_of_pages(client: ClientSession, cdx_server: str, domain: str, max_retry: int, sleep_step: int, page_size: Optional[int] = None)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.aggregator.index_query.IndexAggregator.html b/docs/build/html/generated/cmoncrawl.aggregator.index_query.IndexAggregator.html
new file mode 100644
index 00000000..a1769074
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.aggregator.index_query.IndexAggregator.html
@@ -0,0 +1,573 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.aggregator.index_query.IndexAggregator — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.aggregator.index_query.IndexAggregator
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.aggregator.index_query.IndexAggregator
+
+-
+class cmoncrawl.aggregator.index_query.IndexAggregator(domains: List[str], cc_indexes_server: str = 'http://index.commoncrawl.org/collinfo.json', cc_servers: List[str] = [], since: datetime = datetime.datetime(1, 1, 1, 0, 0), to: datetime = datetime.datetime(9999, 12, 31, 23, 59, 59, 999999), limit: Optional[int] = None, max_retry: int = 5, prefetch_size: int = 3, sleep_step: int = 20)
+
+-
+__init__(domains: List[str], cc_indexes_server: str = 'http://index.commoncrawl.org/collinfo.json', cc_servers: List[str] = [], since: datetime = datetime.datetime(1, 1, 1, 0, 0), to: datetime = datetime.datetime(9999, 12, 31, 23, 59, 59, 999999), limit: Optional[int] = None, max_retry: int = 5, prefetch_size: int = 3, sleep_step: int = 20) → None
+
+
+Methods
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.aggregator.index_query.html b/docs/build/html/generated/cmoncrawl.aggregator.index_query.html
new file mode 100644
index 00000000..50a2ad17
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.aggregator.index_query.html
@@ -0,0 +1,499 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.aggregator.index_query — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.aggregator.index_query
+
+
+
+
+
+
+
+
+cmoncrawl.aggregator.index_query
+Functions
+
+
+
+
+
+
+crawl_to_year (crawl)
|
+ |
+
+timestamp_to_datetime (timestamp)
|
+ |
+
+to_timestamp_format (date)
|
+ |
+
+
+
+Classes
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.aggregator.utils.helpers.html b/docs/build/html/generated/cmoncrawl.aggregator.utils.helpers.html
new file mode 100644
index 00000000..6731d0cc
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.aggregator.utils.helpers.html
@@ -0,0 +1,481 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.aggregator.utils.helpers — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.aggregator.utils.helpers
+
+
+
+
+
+
+
+
+cmoncrawl.aggregator.utils.helpers
+Functions
+
+
+
+
+
+
+unify_url_id (url)
|
+ |
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.aggregator.utils.html b/docs/build/html/generated/cmoncrawl.aggregator.utils.html
new file mode 100644
index 00000000..205b12d2
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.aggregator.utils.html
@@ -0,0 +1,484 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.aggregator.utils — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.aggregator.utils
+
+
+
+
+
+
+
+
+cmoncrawl.aggregator.utils
+Modules
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.aggregator.utils.ndjson_decoder.Decoder.__init__.html b/docs/build/html/generated/cmoncrawl.aggregator.utils.ndjson_decoder.Decoder.__init__.html
new file mode 100644
index 00000000..79f3e82d
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.aggregator.utils.ndjson_decoder.Decoder.__init__.html
@@ -0,0 +1,545 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.aggregator.utils.ndjson_decoder.Decoder.__init__ — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.aggregator.utils.ndjson_decoder.Decoder.__init__
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.aggregator.utils.ndjson_decoder.Decoder.__init__
+
+-
+Decoder.__init__(*, object_hook=None, parse_float=None, parse_int=None, parse_constant=None, strict=True, object_pairs_hook=None)
+object_hook
, if specified, will be called with the result
+of every JSON object decoded and its return value will be used in
+place of the given dict
. This can be used to provide custom
+deserializations (e.g. to support JSON-RPC class hinting).
+object_pairs_hook
, if specified will be called with the result of
+every JSON object decoded with an ordered list of pairs. The return
+value of object_pairs_hook
will be used instead of the dict
.
+This feature can be used to implement custom decoders.
+If object_hook
is also defined, the object_pairs_hook
takes
+priority.
+parse_float
, if specified, will be called with the string
+of every JSON float to be decoded. By default this is equivalent to
+float(num_str). This can be used to use another datatype or parser
+for JSON floats (e.g. decimal.Decimal).
+parse_int
, if specified, will be called with the string
+of every JSON int to be decoded. By default this is equivalent to
+int(num_str). This can be used to use another datatype or parser
+for JSON integers (e.g. float).
+parse_constant
, if specified, will be called with one of the
+following strings: -Infinity, Infinity, NaN.
+This can be used to raise an exception if invalid JSON numbers
+are encountered.
+If strict
is false (true is the default), then control
+characters will be allowed inside strings. Control characters in
+this context are those with character codes in the 0-31 range,
+including '\t'
(tab), '\n'
, '\r'
and '\0'
.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.aggregator.utils.ndjson_decoder.Decoder.decode.html b/docs/build/html/generated/cmoncrawl.aggregator.utils.ndjson_decoder.Decoder.decode.html
new file mode 100644
index 00000000..efa625df
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.aggregator.utils.ndjson_decoder.Decoder.decode.html
@@ -0,0 +1,521 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.aggregator.utils.ndjson_decoder.Decoder.decode — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.aggregator.utils.ndjson_decoder.Decoder.decode
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.aggregator.utils.ndjson_decoder.Decoder.decode
+
+-
+Decoder.decode(s: str, *args, **kwargs)
+Return the Python representation of s
(a str
instance
+containing a JSON document).
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.aggregator.utils.ndjson_decoder.Decoder.html b/docs/build/html/generated/cmoncrawl.aggregator.utils.ndjson_decoder.Decoder.html
new file mode 100644
index 00000000..ff13ec24
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.aggregator.utils.ndjson_decoder.Decoder.html
@@ -0,0 +1,590 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.aggregator.utils.ndjson_decoder.Decoder — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.aggregator.utils.ndjson_decoder.Decoder
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.aggregator.utils.ndjson_decoder.Decoder
+
+-
+class cmoncrawl.aggregator.utils.ndjson_decoder.Decoder(*, object_hook=None, parse_float=None, parse_int=None, parse_constant=None, strict=True, object_pairs_hook=None)
+
+-
+__init__(*, object_hook=None, parse_float=None, parse_int=None, parse_constant=None, strict=True, object_pairs_hook=None)
+object_hook
, if specified, will be called with the result
+of every JSON object decoded and its return value will be used in
+place of the given dict
. This can be used to provide custom
+deserializations (e.g. to support JSON-RPC class hinting).
+object_pairs_hook
, if specified will be called with the result of
+every JSON object decoded with an ordered list of pairs. The return
+value of object_pairs_hook
will be used instead of the dict
.
+This feature can be used to implement custom decoders.
+If object_hook
is also defined, the object_pairs_hook
takes
+priority.
+parse_float
, if specified, will be called with the string
+of every JSON float to be decoded. By default this is equivalent to
+float(num_str). This can be used to use another datatype or parser
+for JSON floats (e.g. decimal.Decimal).
+parse_int
, if specified, will be called with the string
+of every JSON int to be decoded. By default this is equivalent to
+int(num_str). This can be used to use another datatype or parser
+for JSON integers (e.g. float).
+parse_constant
, if specified, will be called with one of the
+following strings: -Infinity, Infinity, NaN.
+This can be used to raise an exception if invalid JSON numbers
+are encountered.
+If strict
is false (true is the default), then control
+characters will be allowed inside strings. Control characters in
+this context are those with character codes in the 0-31 range,
+including '\t'
(tab), '\n'
, '\r'
and '\0'
.
+
+
+Methods
+
+
+
+
+
+
+__init__ (*[, object_hook, parse_float, ...])
|
+object_hook , if specified, will be called with the result of every JSON object decoded and its return value will be used in place of the given dict .
|
+
+decode (s, *args, **kwargs)
|
+Return the Python representation of s (a str instance containing a JSON document). |
+
+raw_decode (s[, idx])
|
+Decode a JSON document from s (a str beginning with a JSON document) and return a 2-tuple of the Python representation and the index in s where the document ended. |
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.aggregator.utils.ndjson_decoder.Decoder.raw_decode.html b/docs/build/html/generated/cmoncrawl.aggregator.utils.ndjson_decoder.Decoder.raw_decode.html
new file mode 100644
index 00000000..ebd04aed
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.aggregator.utils.ndjson_decoder.Decoder.raw_decode.html
@@ -0,0 +1,524 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.aggregator.utils.ndjson_decoder.Decoder.raw_decode — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.aggregator.utils.ndjson_decoder.Decoder.raw_decode
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.aggregator.utils.ndjson_decoder.Decoder.raw_decode
+
+-
+Decoder.raw_decode(s, idx=0)
+Decode a JSON document from s
(a str
beginning with
+a JSON document) and return a 2-tuple of the Python
+representation and the index in s
where the document ended.
+This can be used to decode a JSON document from a string that may
+have extraneous data at the end.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.aggregator.utils.ndjson_decoder.html b/docs/build/html/generated/cmoncrawl.aggregator.utils.ndjson_decoder.html
new file mode 100644
index 00000000..7f7dc581
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.aggregator.utils.ndjson_decoder.html
@@ -0,0 +1,481 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.aggregator.utils.ndjson_decoder — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.aggregator.utils.ndjson_decoder
+
+
+
+
+
+
+
+
+cmoncrawl.aggregator.utils.ndjson_decoder
+Classes
+
+
+
+
+
+
+Decoder (*[, object_hook, parse_float, ...])
|
+ |
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.common.html b/docs/build/html/generated/cmoncrawl.common.html
new file mode 100644
index 00000000..338fb178
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.common.html
@@ -0,0 +1,484 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.common — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.common
+Modules
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.common.loggers.html b/docs/build/html/generated/cmoncrawl.common.loggers.html
new file mode 100644
index 00000000..f23441f8
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.common.loggers.html
@@ -0,0 +1,469 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.common.loggers — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.common.loggers
+
+
+
+
+
+
+
+
+cmoncrawl.common.loggers
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.common.types.DomainCrawl.__init__.html b/docs/build/html/generated/cmoncrawl.common.types.DomainCrawl.__init__.html
new file mode 100644
index 00000000..b9a0241f
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.common.types.DomainCrawl.__init__.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.common.types.DomainCrawl.__init__ — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.common.types.DomainCrawl.__init__
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.common.types.DomainCrawl.__init__
+
+-
+DomainCrawl.__init__(domain: str = '', cdx_server: str = '', page: int = 0) → None
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.common.types.DomainCrawl.html b/docs/build/html/generated/cmoncrawl.common.types.DomainCrawl.html
new file mode 100644
index 00000000..946f4997
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.common.types.DomainCrawl.html
@@ -0,0 +1,576 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.common.types.DomainCrawl — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.common.types.DomainCrawl
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.common.types.DomainCrawl
+
+-
+class cmoncrawl.common.types.DomainCrawl(domain: str = '', cdx_server: str = '', page: int = 0)
+
+-
+__init__(domain: str = '', cdx_server: str = '', page: int = 0) → None
+
+
+Methods
+
+
+
+
+
+
+__init__ ([domain, cdx_server, page])
|
+ |
+
+
+
+Attributes
+
+
+
+
+
+
+cdx_server
|
+ |
+
+domain
|
+ |
+
+page
|
+ |
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.common.types.DomainRecord.__init__.html b/docs/build/html/generated/cmoncrawl.common.types.DomainRecord.__init__.html
new file mode 100644
index 00000000..508f1dd0
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.common.types.DomainRecord.__init__.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.common.types.DomainRecord.__init__ — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.common.types.DomainRecord.__init__
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.common.types.DomainRecord.__init__
+
+-
+DomainRecord.__init__(filename: str, url: str | None, offset: int, length: int, digest: Optional[str] = None, encoding: Optional[str] = None, timestamp: Optional[datetime] = None) → None
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.common.types.DomainRecord.from_dict.html b/docs/build/html/generated/cmoncrawl.common.types.DomainRecord.from_dict.html
new file mode 100644
index 00000000..fa3ee73a
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.common.types.DomainRecord.from_dict.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.common.types.DomainRecord.from_dict — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.common.types.DomainRecord.from_dict
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.common.types.DomainRecord.from_dict
+
+-
+classmethod DomainRecord.from_dict(kvs: Optional[Union[dict, list, str, int, float, bool]], *, infer_missing=False) → A
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.common.types.DomainRecord.from_json.html b/docs/build/html/generated/cmoncrawl.common.types.DomainRecord.from_json.html
new file mode 100644
index 00000000..77474be2
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.common.types.DomainRecord.from_json.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.common.types.DomainRecord.from_json — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.common.types.DomainRecord.from_json
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.common.types.DomainRecord.from_json
+
+-
+classmethod DomainRecord.from_json(s: Union[str, bytes, bytearray], *, parse_float=None, parse_int=None, parse_constant=None, infer_missing=False, **kw) → A
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.common.types.DomainRecord.html b/docs/build/html/generated/cmoncrawl.common.types.DomainRecord.html
new file mode 100644
index 00000000..ee52ad63
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.common.types.DomainRecord.html
@@ -0,0 +1,603 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.common.types.DomainRecord — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.common.types.DomainRecord
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.common.types.DomainRecord
+
+-
+class cmoncrawl.common.types.DomainRecord(filename: str, url: str | None, offset: int, length: int, digest: str | None = None, encoding: str | None = None, timestamp: datetime.datetime | None = None)
+
+-
+__init__(filename: str, url: str | None, offset: int, length: int, digest: Optional[str] = None, encoding: Optional[str] = None, timestamp: Optional[datetime] = None) → None
+
+
+Methods
+
+
+
+
+
+
+__init__ (filename, url, offset, length[, ...])
|
+ |
+
+from_dict (kvs, *[, infer_missing])
|
+ |
+
+from_json (s, *[, parse_float, parse_int, ...])
|
+ |
+
+schema (*[, infer_missing, only, exclude, ...])
|
+ |
+
+to_dict ([encode_json])
|
+ |
+
+to_json (*[, skipkeys, ensure_ascii, ...])
|
+ |
+
+
+
+Attributes
+
+
+
+
+
+
+digest
|
+ |
+
+encoding
|
+ |
+
+timestamp
|
+ |
+
+filename
|
+ |
+
+url
|
+ |
+
+offset
|
+ |
+
+length
|
+ |
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.common.types.DomainRecord.schema.html b/docs/build/html/generated/cmoncrawl.common.types.DomainRecord.schema.html
new file mode 100644
index 00000000..37c71eea
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.common.types.DomainRecord.schema.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.common.types.DomainRecord.schema — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.common.types.DomainRecord.schema
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.common.types.DomainRecord.schema
+
+-
+classmethod DomainRecord.schema(*, infer_missing: bool = False, only=None, exclude=(), many: bool = False, context=None, load_only=(), dump_only=(), partial: bool = False, unknown=None) → SchemaF[A]
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.common.types.DomainRecord.to_dict.html b/docs/build/html/generated/cmoncrawl.common.types.DomainRecord.to_dict.html
new file mode 100644
index 00000000..f583c047
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.common.types.DomainRecord.to_dict.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.common.types.DomainRecord.to_dict — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.common.types.DomainRecord.to_dict
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.common.types.DomainRecord.to_dict
+
+-
+DomainRecord.to_dict(encode_json=False) → Dict[str, Optional[Union[dict, list, str, int, float, bool]]]
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.common.types.DomainRecord.to_json.html b/docs/build/html/generated/cmoncrawl.common.types.DomainRecord.to_json.html
new file mode 100644
index 00000000..93a25bea
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.common.types.DomainRecord.to_json.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.common.types.DomainRecord.to_json — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.common.types.DomainRecord.to_json
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.common.types.DomainRecord.to_json
+
+-
+DomainRecord.to_json(*, skipkeys: bool = False, ensure_ascii: bool = True, check_circular: bool = True, allow_nan: bool = True, indent: Optional[Union[int, str]] = None, separators: Optional[Tuple[str, str]] = None, default: Optional[Callable] = None, sort_keys: bool = False, **kw) → str
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.common.types.ExtractConfig.__init__.html b/docs/build/html/generated/cmoncrawl.common.types.ExtractConfig.__init__.html
new file mode 100644
index 00000000..b6d4d3e0
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.common.types.ExtractConfig.__init__.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.common.types.ExtractConfig.__init__ — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.common.types.ExtractConfig.__init__
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.common.types.ExtractConfig.from_dict.html b/docs/build/html/generated/cmoncrawl.common.types.ExtractConfig.from_dict.html
new file mode 100644
index 00000000..c7dc7ed8
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.common.types.ExtractConfig.from_dict.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.common.types.ExtractConfig.from_dict — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.common.types.ExtractConfig.from_dict
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.common.types.ExtractConfig.from_json.html b/docs/build/html/generated/cmoncrawl.common.types.ExtractConfig.from_json.html
new file mode 100644
index 00000000..d63f6b5f
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.common.types.ExtractConfig.from_json.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.common.types.ExtractConfig.from_json — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.common.types.ExtractConfig.from_json
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.common.types.ExtractConfig.html b/docs/build/html/generated/cmoncrawl.common.types.ExtractConfig.html
new file mode 100644
index 00000000..0c80a102
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.common.types.ExtractConfig.html
@@ -0,0 +1,589 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.common.types.ExtractConfig — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.common.types.ExtractConfig
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.common.types.ExtractConfig.schema.html b/docs/build/html/generated/cmoncrawl.common.types.ExtractConfig.schema.html
new file mode 100644
index 00000000..88476c16
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.common.types.ExtractConfig.schema.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.common.types.ExtractConfig.schema — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.common.types.ExtractConfig.schema
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.common.types.ExtractConfig.to_dict.html b/docs/build/html/generated/cmoncrawl.common.types.ExtractConfig.to_dict.html
new file mode 100644
index 00000000..5d1e41fa
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.common.types.ExtractConfig.to_dict.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.common.types.ExtractConfig.to_dict — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.common.types.ExtractConfig.to_dict
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.common.types.ExtractConfig.to_json.html b/docs/build/html/generated/cmoncrawl.common.types.ExtractConfig.to_json.html
new file mode 100644
index 00000000..07b7603a
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.common.types.ExtractConfig.to_json.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.common.types.ExtractConfig.to_json — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.common.types.ExtractConfig.to_json
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.common.types.ExtractorConfig.__init__.html b/docs/build/html/generated/cmoncrawl.common.types.ExtractorConfig.__init__.html
new file mode 100644
index 00000000..ec05dbff
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.common.types.ExtractorConfig.__init__.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.common.types.ExtractorConfig.__init__ — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.common.types.ExtractorConfig.__init__
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.common.types.ExtractorConfig.from_dict.html b/docs/build/html/generated/cmoncrawl.common.types.ExtractorConfig.from_dict.html
new file mode 100644
index 00000000..cbec99df
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.common.types.ExtractorConfig.from_dict.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.common.types.ExtractorConfig.from_dict — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.common.types.ExtractorConfig.from_dict
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.common.types.ExtractorConfig.from_json.html b/docs/build/html/generated/cmoncrawl.common.types.ExtractorConfig.from_json.html
new file mode 100644
index 00000000..99d182ea
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.common.types.ExtractorConfig.from_json.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.common.types.ExtractorConfig.from_json — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.common.types.ExtractorConfig.from_json
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.common.types.ExtractorConfig.html b/docs/build/html/generated/cmoncrawl.common.types.ExtractorConfig.html
new file mode 100644
index 00000000..735e206c
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.common.types.ExtractorConfig.html
@@ -0,0 +1,592 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.common.types.ExtractorConfig — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.common.types.ExtractorConfig
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.common.types.ExtractorConfig.schema.html b/docs/build/html/generated/cmoncrawl.common.types.ExtractorConfig.schema.html
new file mode 100644
index 00000000..4119ce4e
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.common.types.ExtractorConfig.schema.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.common.types.ExtractorConfig.schema — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.common.types.ExtractorConfig.schema
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.common.types.ExtractorConfig.to_dict.html b/docs/build/html/generated/cmoncrawl.common.types.ExtractorConfig.to_dict.html
new file mode 100644
index 00000000..c8da41e8
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.common.types.ExtractorConfig.to_dict.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.common.types.ExtractorConfig.to_dict — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.common.types.ExtractorConfig.to_dict
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.common.types.ExtractorConfig.to_json.html b/docs/build/html/generated/cmoncrawl.common.types.ExtractorConfig.to_json.html
new file mode 100644
index 00000000..507ba6ec
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.common.types.ExtractorConfig.to_json.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.common.types.ExtractorConfig.to_json — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.common.types.ExtractorConfig.to_json
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.common.types.PipeMetadata.__init__.html b/docs/build/html/generated/cmoncrawl.common.types.PipeMetadata.__init__.html
new file mode 100644
index 00000000..9616e705
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.common.types.PipeMetadata.__init__.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.common.types.PipeMetadata.__init__ — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.common.types.PipeMetadata.__init__
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.common.types.PipeMetadata.html b/docs/build/html/generated/cmoncrawl.common.types.PipeMetadata.html
new file mode 100644
index 00000000..21dfb608
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.common.types.PipeMetadata.html
@@ -0,0 +1,586 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.common.types.PipeMetadata — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.common.types.PipeMetadata
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.common.types.RetrieveResponse.__init__.html b/docs/build/html/generated/cmoncrawl.common.types.RetrieveResponse.__init__.html
new file mode 100644
index 00000000..1e5c9af7
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.common.types.RetrieveResponse.__init__.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.common.types.RetrieveResponse.__init__ — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.common.types.RetrieveResponse.__init__
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.common.types.RetrieveResponse.__init__
+
+-
+RetrieveResponse.__init__(status: int, content: Any, reason: None | str) → None
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.common.types.RetrieveResponse.html b/docs/build/html/generated/cmoncrawl.common.types.RetrieveResponse.html
new file mode 100644
index 00000000..a94af50e
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.common.types.RetrieveResponse.html
@@ -0,0 +1,576 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.common.types.RetrieveResponse — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.common.types.RetrieveResponse
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.common.types.RetrieveResponse
+
+-
+class cmoncrawl.common.types.RetrieveResponse(status: int, content: Any, reason: None | str)
+
+-
+__init__(status: int, content: Any, reason: None | str) → None
+
+
+Methods
+
+
+
+
+
+
+__init__ (status, content, reason)
|
+ |
+
+
+
+Attributes
+
+
+
+
+
+
+status
|
+ |
+
+content
|
+ |
+
+reason
|
+ |
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.common.types.RoutesConfig.__init__.html b/docs/build/html/generated/cmoncrawl.common.types.RoutesConfig.__init__.html
new file mode 100644
index 00000000..299d5545
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.common.types.RoutesConfig.__init__.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.common.types.RoutesConfig.__init__ — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.common.types.RoutesConfig.__init__
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.common.types.RoutesConfig.__init__
+
+-
+RoutesConfig.__init__(regexes: list[str] = <factory>, extractors: list[cmoncrawl.common.types.ExtractorConfig] = <factory>) → None
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.common.types.RoutesConfig.from_dict.html b/docs/build/html/generated/cmoncrawl.common.types.RoutesConfig.from_dict.html
new file mode 100644
index 00000000..0601124b
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.common.types.RoutesConfig.from_dict.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.common.types.RoutesConfig.from_dict — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.common.types.RoutesConfig.from_dict
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.common.types.RoutesConfig.from_dict
+
+-
+classmethod RoutesConfig.from_dict(kvs: Optional[Union[dict, list, str, int, float, bool]], *, infer_missing=False) → A
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.common.types.RoutesConfig.from_json.html b/docs/build/html/generated/cmoncrawl.common.types.RoutesConfig.from_json.html
new file mode 100644
index 00000000..b2adc9a5
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.common.types.RoutesConfig.from_json.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.common.types.RoutesConfig.from_json — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.common.types.RoutesConfig.from_json
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.common.types.RoutesConfig.from_json
+
+-
+classmethod RoutesConfig.from_json(s: Union[str, bytes, bytearray], *, parse_float=None, parse_int=None, parse_constant=None, infer_missing=False, **kw) → A
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.common.types.RoutesConfig.html b/docs/build/html/generated/cmoncrawl.common.types.RoutesConfig.html
new file mode 100644
index 00000000..252772b9
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.common.types.RoutesConfig.html
@@ -0,0 +1,589 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.common.types.RoutesConfig — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.common.types.RoutesConfig
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.common.types.RoutesConfig
+
+-
+class cmoncrawl.common.types.RoutesConfig(regexes: list[str] = <factory>, extractors: list[cmoncrawl.common.types.ExtractorConfig] = <factory>)
+Configuration for extractors.
+
+-
+__init__(regexes: list[str] = <factory>, extractors: list[cmoncrawl.common.types.ExtractorConfig] = <factory>) → None
+
+
+Methods
+
+
+
+
+
+
+__init__ ([regexes, extractors])
|
+ |
+
+from_dict (kvs, *[, infer_missing])
|
+ |
+
+from_json (s, *[, parse_float, parse_int, ...])
|
+ |
+
+schema (*[, infer_missing, only, exclude, ...])
|
+ |
+
+to_dict ([encode_json])
|
+ |
+
+to_json (*[, skipkeys, ensure_ascii, ...])
|
+ |
+
+
+
+Attributes
+
+
+
+
+
+
+regexes
|
+ |
+
+extractors
|
+ |
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.common.types.RoutesConfig.schema.html b/docs/build/html/generated/cmoncrawl.common.types.RoutesConfig.schema.html
new file mode 100644
index 00000000..ec348b16
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.common.types.RoutesConfig.schema.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.common.types.RoutesConfig.schema — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.common.types.RoutesConfig.schema
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.common.types.RoutesConfig.schema
+
+-
+classmethod RoutesConfig.schema(*, infer_missing: bool = False, only=None, exclude=(), many: bool = False, context=None, load_only=(), dump_only=(), partial: bool = False, unknown=None) → SchemaF[A]
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.common.types.RoutesConfig.to_dict.html b/docs/build/html/generated/cmoncrawl.common.types.RoutesConfig.to_dict.html
new file mode 100644
index 00000000..4072ec99
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.common.types.RoutesConfig.to_dict.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.common.types.RoutesConfig.to_dict — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.common.types.RoutesConfig.to_dict
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.common.types.RoutesConfig.to_dict
+
+-
+RoutesConfig.to_dict(encode_json=False) → Dict[str, Optional[Union[dict, list, str, int, float, bool]]]
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.common.types.RoutesConfig.to_json.html b/docs/build/html/generated/cmoncrawl.common.types.RoutesConfig.to_json.html
new file mode 100644
index 00000000..b54e197a
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.common.types.RoutesConfig.to_json.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.common.types.RoutesConfig.to_json — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.common.types.RoutesConfig.to_json
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.common.types.RoutesConfig.to_json
+
+-
+RoutesConfig.to_json(*, skipkeys: bool = False, ensure_ascii: bool = True, check_circular: bool = True, allow_nan: bool = True, indent: Optional[Union[int, str]] = None, separators: Optional[Tuple[str, str]] = None, default: Optional[Callable] = None, sort_keys: bool = False, **kw) → str
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.common.types.html b/docs/build/html/generated/cmoncrawl.common.types.html
new file mode 100644
index 00000000..1e75b549
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.common.types.html
@@ -0,0 +1,499 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.common.types — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.common.types
+
+
+
+
+
+
+
+
+cmoncrawl.common.types
+Classes
+
+
+
+
+
+
+DomainCrawl ([domain, cdx_server, page])
|
+ |
+
+DomainRecord (filename, url, offset, length)
|
+ |
+
+ExtractConfig (extractors_path, routes)
|
+Configuration for run. |
+
+ExtractorConfig (name[, since, to])
|
+Configuration for extractor. |
+
+PipeMetadata (domain_record, article_data, ...)
|
+Metadata for a pipe. |
+
+RetrieveResponse (status, content, reason)
|
+ |
+
+RoutesConfig (regexes, extractors)
|
+Configuration for extractors. |
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.html b/docs/build/html/generated/cmoncrawl.html
new file mode 100644
index 00000000..08614bdf
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.html
@@ -0,0 +1,487 @@
+
+
+
+
+
+
+
+
+ cmoncrawl — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.extraction.filters.html b/docs/build/html/generated/cmoncrawl.processor.extraction.filters.html
new file mode 100644
index 00000000..83d937ff
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.extraction.filters.html
@@ -0,0 +1,484 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.extraction.filters — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.extraction.filters
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.extraction.html b/docs/build/html/generated/cmoncrawl.processor.extraction.html
new file mode 100644
index 00000000..ea9fced2
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.extraction.html
@@ -0,0 +1,484 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.extraction — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.extraction
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.extraction.utils.html b/docs/build/html/generated/cmoncrawl.processor.extraction.utils.html
new file mode 100644
index 00000000..4fe7c175
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.extraction.utils.html
@@ -0,0 +1,511 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.extraction.utils — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.extraction.utils
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.html b/docs/build/html/generated/cmoncrawl.processor.html
new file mode 100644
index 00000000..2f05fb6a
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.html
@@ -0,0 +1,484 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor
+
+
+
+
+
+
+
+
+cmoncrawl.processor
+Modules
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.AsyncDownloader.__init__.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.AsyncDownloader.__init__.html
new file mode 100644
index 00000000..9ef1b5b6
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.AsyncDownloader.__init__.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.downloader.AsyncDownloader.__init__ — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.downloader.AsyncDownloader.__init__
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.downloader.AsyncDownloader.__init__
+
+-
+AsyncDownloader.__init__(base_url: str = 'https://data.commoncrawl.org/', digest_verification: bool = True, max_retry: int = 5, sleep_step: int = 10, encoding: str = 'latin-1')
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.AsyncDownloader.aclose.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.AsyncDownloader.aclose.html
new file mode 100644
index 00000000..4c42f287
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.AsyncDownloader.aclose.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.downloader.AsyncDownloader.aclose — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.downloader.AsyncDownloader.aclose
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.downloader.AsyncDownloader.aclose
+
+-
+async AsyncDownloader.aclose(exc_type: Optional[Type[BaseException]], exc_val: BaseException | None, exc_tb: Optional[TracebackType] = None) → IDownloader
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.AsyncDownloader.aopen.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.AsyncDownloader.aopen.html
new file mode 100644
index 00000000..491d6760
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.AsyncDownloader.aopen.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.downloader.AsyncDownloader.aopen — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.downloader.AsyncDownloader.aopen
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.downloader.AsyncDownloader.aopen
+
+-
+async AsyncDownloader.aopen() → AsyncDownloader
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.AsyncDownloader.download.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.AsyncDownloader.download.html
new file mode 100644
index 00000000..b2db5e7e
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.AsyncDownloader.download.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.downloader.AsyncDownloader.download — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.downloader.AsyncDownloader.download
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.downloader.AsyncDownloader.download
+
+-
+async AsyncDownloader.download(domain_record: DomainRecord)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.AsyncDownloader.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.AsyncDownloader.html
new file mode 100644
index 00000000..d81f1815
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.AsyncDownloader.html
@@ -0,0 +1,570 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.downloader.AsyncDownloader — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.downloader.AsyncDownloader
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.downloader.AsyncDownloader
+
+-
+class cmoncrawl.processor.pipeline.downloader.AsyncDownloader(base_url: str = 'https://data.commoncrawl.org/', digest_verification: bool = True, max_retry: int = 5, sleep_step: int = 10, encoding: str = 'latin-1')
+
+-
+__init__(base_url: str = 'https://data.commoncrawl.org/', digest_verification: bool = True, max_retry: int = 5, sleep_step: int = 10, encoding: str = 'latin-1')
+
+
+Methods
+
+
+
+
+
+
+__init__ ([base_url, digest_verification, ...])
|
+ |
+
+aclose (exc_type, exc_val[, exc_tb])
|
+ |
+
+aopen ()
|
+ |
+
+download (domain_record)
|
+ |
+
+unwrap (response, domain_record)
|
+ |
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.AsyncDownloader.unwrap.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.AsyncDownloader.unwrap.html
new file mode 100644
index 00000000..481c5502
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.AsyncDownloader.unwrap.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.downloader.AsyncDownloader.unwrap — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.downloader.AsyncDownloader.unwrap
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.downloader.AsyncDownloader.unwrap
+
+-
+AsyncDownloader.unwrap(response: bytes, domain_record: DomainRecord) → List[Tuple[str, PipeMetadata]]
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.DownloaderDummy.__init__.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.DownloaderDummy.__init__.html
new file mode 100644
index 00000000..72f4c750
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.DownloaderDummy.__init__.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.downloader.DownloaderDummy.__init__ — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.downloader.DownloaderDummy.__init__
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.downloader.DownloaderDummy.__init__
+
+-
+DownloaderDummy.__init__(files: List[Path], url: Optional[str] = None, date: Optional[datetime] = None)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.DownloaderDummy.download.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.DownloaderDummy.download.html
new file mode 100644
index 00000000..bbc1e056
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.DownloaderDummy.download.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.downloader.DownloaderDummy.download — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.downloader.DownloaderDummy.download
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.downloader.DownloaderDummy.download
+
+-
+async DownloaderDummy.download(domain_record: DomainRecord)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.DownloaderDummy.extract_url.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.DownloaderDummy.extract_url.html
new file mode 100644
index 00000000..84cd6ed2
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.DownloaderDummy.extract_url.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.downloader.DownloaderDummy.extract_url — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.downloader.DownloaderDummy.extract_url
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.DownloaderDummy.extract_year.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.DownloaderDummy.extract_year.html
new file mode 100644
index 00000000..823adb37
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.DownloaderDummy.extract_year.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.downloader.DownloaderDummy.extract_year — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.downloader.DownloaderDummy.extract_year
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.DownloaderDummy.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.DownloaderDummy.html
new file mode 100644
index 00000000..83654aa4
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.DownloaderDummy.html
@@ -0,0 +1,573 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.downloader.DownloaderDummy — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.downloader.DownloaderDummy
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.downloader.DownloaderDummy
+
+-
+class cmoncrawl.processor.pipeline.downloader.DownloaderDummy(files: List[Path], url: Optional[str] = None, date: Optional[datetime] = None)
+Dummy downloader for testing
+It doesn’t download anything but return files passed in the constructor
+and extracts metadata from the file
+
+-
+__init__(files: List[Path], url: Optional[str] = None, date: Optional[datetime] = None)
+
+
+Methods
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.DownloaderDummy.mine_metadata.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.DownloaderDummy.mine_metadata.html
new file mode 100644
index 00000000..16c39bdd
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.DownloaderDummy.mine_metadata.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.downloader.DownloaderDummy.mine_metadata — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.downloader.DownloaderDummy.mine_metadata
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.IDownloader.__init__.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.IDownloader.__init__.html
new file mode 100644
index 00000000..d5253fe8
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.IDownloader.__init__.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.downloader.IDownloader.__init__ — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.downloader.IDownloader.__init__
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.downloader.IDownloader.__init__
+
+-
+IDownloader.__init__()
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.IDownloader.download.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.IDownloader.download.html
new file mode 100644
index 00000000..5ccadcd1
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.IDownloader.download.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.downloader.IDownloader.download — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.downloader.IDownloader.download
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.downloader.IDownloader.download
+
+-
+async IDownloader.download(domain_record: DomainRecord) → List[Tuple[str, PipeMetadata]]
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.IDownloader.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.IDownloader.html
new file mode 100644
index 00000000..bfad07e3
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.IDownloader.html
@@ -0,0 +1,561 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.downloader.IDownloader — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.downloader.IDownloader
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.downloader.IDownloader
+
+-
+class cmoncrawl.processor.pipeline.downloader.IDownloader
+
+-
+__init__()
+
+
+Methods
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.html
new file mode 100644
index 00000000..01143f55
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.downloader.html
@@ -0,0 +1,487 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.downloader — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.downloader
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.downloader
+Classes
+
+
+
+
+
+
+AsyncDownloader ([base_url, ...])
|
+ |
+
+DownloaderDummy (files[, url, date])
|
+Dummy downloader for testing It doesn't download anything but return files passed in the constructor and extracts metadata from the file |
+
+IDownloader ()
|
+ |
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.BaseExtractor.__init__.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.BaseExtractor.__init__.html
new file mode 100644
index 00000000..cc1d0167
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.BaseExtractor.__init__.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.extractor.BaseExtractor.__init__ — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.extractor.BaseExtractor.__init__
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.BaseExtractor.extract.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.BaseExtractor.extract.html
new file mode 100644
index 00000000..83bc1602
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.BaseExtractor.extract.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.extractor.BaseExtractor.extract — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.extractor.BaseExtractor.extract
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.BaseExtractor.extract_soup.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.BaseExtractor.extract_soup.html
new file mode 100644
index 00000000..72450446
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.BaseExtractor.extract_soup.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.extractor.BaseExtractor.extract_soup — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.extractor.BaseExtractor.extract_soup
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.BaseExtractor.filter_raw.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.BaseExtractor.filter_raw.html
new file mode 100644
index 00000000..cff07f5d
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.BaseExtractor.filter_raw.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.extractor.BaseExtractor.filter_raw — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.extractor.BaseExtractor.filter_raw
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.BaseExtractor.filter_soup.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.BaseExtractor.filter_soup.html
new file mode 100644
index 00000000..6ff9c6af
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.BaseExtractor.filter_soup.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.extractor.BaseExtractor.filter_soup — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.extractor.BaseExtractor.filter_soup
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.BaseExtractor.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.BaseExtractor.html
new file mode 100644
index 00000000..70708e94
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.BaseExtractor.html
@@ -0,0 +1,573 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.extractor.BaseExtractor — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.extractor.BaseExtractor
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.BaseExtractor.preprocess.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.BaseExtractor.preprocess.html
new file mode 100644
index 00000000..71e41bad
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.BaseExtractor.preprocess.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.extractor.BaseExtractor.preprocess — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.extractor.BaseExtractor.preprocess
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.__init__.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.__init__.html
new file mode 100644
index 00000000..09c074cf
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.__init__.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.__init__ — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.__init__
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.__init__
+
+-
+DomainRecordExtractor.__init__(filter_non_ok: bool = True)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.extract.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.extract.html
new file mode 100644
index 00000000..d8872af8
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.extract.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.extract — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.extract
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.extract
+
+-
+DomainRecordExtractor.extract(response: str, metadata: PipeMetadata) → Optional[Dict[str, Any]]
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.extract_soup.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.extract_soup.html
new file mode 100644
index 00000000..c10345fd
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.extract_soup.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.extract_soup — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.extract_soup
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.extract_soup
+
+-
+DomainRecordExtractor.extract_soup(soup: BeautifulSoup, metadata: PipeMetadata)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.filter_raw.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.filter_raw.html
new file mode 100644
index 00000000..8c6b4cf5
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.filter_raw.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.filter_raw — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.filter_raw
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.filter_raw
+
+-
+DomainRecordExtractor.filter_raw(response: str, metadata: PipeMetadata)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.filter_soup.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.filter_soup.html
new file mode 100644
index 00000000..9c4789a8
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.filter_soup.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.filter_soup — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.filter_soup
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.filter_soup
+
+-
+DomainRecordExtractor.filter_soup(soup: BeautifulSoup, metadata: PipeMetadata) → bool
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.html
new file mode 100644
index 00000000..5046f924
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.html
@@ -0,0 +1,574 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor
+
+-
+class cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor(filter_non_ok: bool = True)
+Dummy Extractor which simply extracts the html
+
+-
+__init__(filter_non_ok: bool = True)
+
+
+Methods
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.preprocess.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.preprocess.html
new file mode 100644
index 00000000..0ffce529
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.preprocess.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.preprocess — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.preprocess
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.preprocess
+
+-
+DomainRecordExtractor.preprocess(response: str, metadata: PipeMetadata) → str
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.HTMLExtractor.__init__.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.HTMLExtractor.__init__.html
new file mode 100644
index 00000000..182f1ab7
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.HTMLExtractor.__init__.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.extractor.HTMLExtractor.__init__ — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.extractor.HTMLExtractor.__init__
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.HTMLExtractor.extract.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.HTMLExtractor.extract.html
new file mode 100644
index 00000000..3afc3769
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.HTMLExtractor.extract.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.extractor.HTMLExtractor.extract — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.extractor.HTMLExtractor.extract
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.HTMLExtractor.extract_soup.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.HTMLExtractor.extract_soup.html
new file mode 100644
index 00000000..3b1a11e0
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.HTMLExtractor.extract_soup.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.extractor.HTMLExtractor.extract_soup — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.extractor.HTMLExtractor.extract_soup
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.HTMLExtractor.filter_raw.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.HTMLExtractor.filter_raw.html
new file mode 100644
index 00000000..a51c95d8
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.HTMLExtractor.filter_raw.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.extractor.HTMLExtractor.filter_raw — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.extractor.HTMLExtractor.filter_raw
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.HTMLExtractor.filter_soup.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.HTMLExtractor.filter_soup.html
new file mode 100644
index 00000000..bae2c70a
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.HTMLExtractor.filter_soup.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.extractor.HTMLExtractor.filter_soup — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.extractor.HTMLExtractor.filter_soup
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.HTMLExtractor.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.HTMLExtractor.html
new file mode 100644
index 00000000..6202602d
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.HTMLExtractor.html
@@ -0,0 +1,574 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.extractor.HTMLExtractor — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.extractor.HTMLExtractor
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.HTMLExtractor.preprocess.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.HTMLExtractor.preprocess.html
new file mode 100644
index 00000000..ffb49a2f
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.HTMLExtractor.preprocess.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.extractor.HTMLExtractor.preprocess — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.extractor.HTMLExtractor.preprocess
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.IExtractor.__init__.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.IExtractor.__init__.html
new file mode 100644
index 00000000..4c295575
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.IExtractor.__init__.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.extractor.IExtractor.__init__ — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.extractor.IExtractor.__init__
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.IExtractor.extract.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.IExtractor.extract.html
new file mode 100644
index 00000000..917b7f02
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.IExtractor.extract.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.extractor.IExtractor.extract — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.extractor.IExtractor.extract
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.IExtractor.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.IExtractor.html
new file mode 100644
index 00000000..d1940371
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.IExtractor.html
@@ -0,0 +1,561 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.extractor.IExtractor — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.extractor.IExtractor
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.html
new file mode 100644
index 00000000..9afd535e
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.extractor.html
@@ -0,0 +1,490 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.extractor — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.extractor
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.html
new file mode 100644
index 00000000..165ebd4e
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.html
@@ -0,0 +1,493 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline
+Modules
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.pipeline.ProcessorPipeline.__init__.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.pipeline.ProcessorPipeline.__init__.html
new file mode 100644
index 00000000..5b5bb79a
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.pipeline.ProcessorPipeline.__init__.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.pipeline.ProcessorPipeline.__init__ — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.pipeline.ProcessorPipeline.__init__
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.pipeline.ProcessorPipeline.__init__
+
+-
+ProcessorPipeline.__init__(router: IRouter, downloader: IDownloader, outstreamer: IStreamer)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.pipeline.ProcessorPipeline.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.pipeline.ProcessorPipeline.html
new file mode 100644
index 00000000..c3ad39b8
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.pipeline.ProcessorPipeline.html
@@ -0,0 +1,561 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.pipeline.ProcessorPipeline — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.pipeline.ProcessorPipeline
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.pipeline.ProcessorPipeline
+
+-
+class cmoncrawl.processor.pipeline.pipeline.ProcessorPipeline(router: IRouter, downloader: IDownloader, outstreamer: IStreamer)
+
+-
+__init__(router: IRouter, downloader: IDownloader, outstreamer: IStreamer)
+
+
+Methods
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.pipeline.ProcessorPipeline.process_domain_record.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.pipeline.ProcessorPipeline.process_domain_record.html
new file mode 100644
index 00000000..21d7ee8e
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.pipeline.ProcessorPipeline.process_domain_record.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.pipeline.ProcessorPipeline.process_domain_record — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.pipeline.ProcessorPipeline.process_domain_record
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.pipeline.ProcessorPipeline.process_domain_record
+
+-
+async ProcessorPipeline.process_domain_record(domain_record: DomainRecord, additional_info: Dict[str, Any])
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.pipeline.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.pipeline.html
new file mode 100644
index 00000000..161f9f45
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.pipeline.html
@@ -0,0 +1,481 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.pipeline — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.pipeline
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.pipeline
+Classes
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.router.IRouter.__init__.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.router.IRouter.__init__.html
new file mode 100644
index 00000000..d044b6da
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.router.IRouter.__init__.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.router.IRouter.__init__ — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.router.IRouter.__init__
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.router.IRouter.__init__
+
+-
+IRouter.__init__()
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.router.IRouter.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.router.IRouter.html
new file mode 100644
index 00000000..656f28b4
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.router.IRouter.html
@@ -0,0 +1,561 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.router.IRouter — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.router.IRouter
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.router.IRouter
+
+-
+class cmoncrawl.processor.pipeline.router.IRouter
+
+-
+__init__()
+
+
+Methods
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.router.IRouter.route.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.router.IRouter.route.html
new file mode 100644
index 00000000..949a0249
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.router.IRouter.route.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.router.IRouter.route — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.router.IRouter.route
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.router.IRouter.route
+
+-
+abstract IRouter.route(url: str | None, time: datetime.datetime | None, metadata: PipeMetadata) → IExtractor
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.router.Route.__init__.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.router.Route.__init__.html
new file mode 100644
index 00000000..26495f2e
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.router.Route.__init__.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.router.Route.__init__ — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.router.Route.__init__
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.router.Route.__init__
+
+-
+Route.__init__(name: str, regexes: List[Pattern[str]], since: datetime, to: datetime) → None
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.router.Route.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.router.Route.html
new file mode 100644
index 00000000..2fd6a24d
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.router.Route.html
@@ -0,0 +1,579 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.router.Route — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.router.Route
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.router.Route
+
+-
+class cmoncrawl.processor.pipeline.router.Route(name: str, regexes: List[re.Pattern[str]], since: datetime.datetime, to: datetime.datetime)
+
+-
+__init__(name: str, regexes: List[Pattern[str]], since: datetime, to: datetime) → None
+
+
+Methods
+
+
+
+
+
+
+__init__ (name, regexes, since, to)
|
+ |
+
+
+
+Attributes
+
+
+
+
+
+
+name
|
+ |
+
+regexes
|
+ |
+
+since
|
+ |
+
+to
|
+ |
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.router.Router.__init__.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.router.Router.__init__.html
new file mode 100644
index 00000000..b7ba5f6d
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.router.Router.__init__.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.router.Router.__init__ — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.router.Router.__init__
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.router.Router.__init__
+
+-
+Router.__init__()
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.router.Router.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.router.Router.html
new file mode 100644
index 00000000..ff996fb2
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.router.Router.html
@@ -0,0 +1,579 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.router.Router — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.router.Router
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.router.Router
+
+-
+class cmoncrawl.processor.pipeline.router.Router
+
+-
+__init__()
+
+
+Methods
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.router.Router.load_extractor.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.router.Router.load_extractor.html
new file mode 100644
index 00000000..2518ae6d
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.router.Router.load_extractor.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.router.Router.load_extractor — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.router.Router.load_extractor
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.router.Router.load_module.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.router.Router.load_module.html
new file mode 100644
index 00000000..78860f64
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.router.Router.load_module.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.router.Router.load_module — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.router.Router.load_module
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.router.Router.load_module
+
+-
+Router.load_module(module_path: Path)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.router.Router.load_module_as_extractor.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.router.Router.load_module_as_extractor.html
new file mode 100644
index 00000000..dc054fa8
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.router.Router.load_module_as_extractor.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.router.Router.load_module_as_extractor — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.router.Router.load_module_as_extractor
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.router.Router.load_modules.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.router.Router.load_modules.html
new file mode 100644
index 00000000..4efc22bd
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.router.Router.load_modules.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.router.Router.load_modules — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.router.Router.load_modules
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.router.Router.load_modules
+
+-
+Router.load_modules(folder: Path)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.router.Router.register_route.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.router.Router.register_route.html
new file mode 100644
index 00000000..bb48e33f
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.router.Router.register_route.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.router.Router.register_route — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.router.Router.register_route
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.router.Router.register_route
+
+-
+Router.register_route(name: str, regex: Union[str, List[str]], since: Optional[datetime] = None, to: Optional[datetime] = None)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.router.Router.register_routes.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.router.Router.register_routes.html
new file mode 100644
index 00000000..c8ce61f8
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.router.Router.register_routes.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.router.Router.register_routes — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.router.Router.register_routes
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.router.Router.register_routes
+
+-
+Router.register_routes(config: List[RoutesConfig])
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.router.Router.route.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.router.Router.route.html
new file mode 100644
index 00000000..91321e33
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.router.Router.route.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.router.Router.route — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.router.Router.route
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.router.Router.route
+
+-
+Router.route(url: str | None, time: datetime.datetime | None, metadata: PipeMetadata) → IExtractor
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.router.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.router.html
new file mode 100644
index 00000000..364dfa4a
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.router.html
@@ -0,0 +1,487 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.router — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.router
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.router
+Classes
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.__init__.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.__init__.html
new file mode 100644
index 00000000..535fad30
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.__init__.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.__init__ — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.__init__
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.__init__
+
+-
+BaseStreamerFile.__init__(root: Path, max_directory_size: int, max_file_size: int, extension: str, directory_prefix: str = 'directory_', max_retries: int = 3)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.clean_up.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.clean_up.html
new file mode 100644
index 00000000..2fd52f66
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.clean_up.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.clean_up — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.clean_up
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.clean_up
+
+-
+async BaseStreamerFile.clean_up() → None
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.get_file_name.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.get_file_name.html
new file mode 100644
index 00000000..cf390bee
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.get_file_name.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.get_file_name — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.get_file_name
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.get_file_name
+
+-
+BaseStreamerFile.get_file_name(metadata: PipeMetadata)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.html
new file mode 100644
index 00000000..6d7f7032
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.html
@@ -0,0 +1,571 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.streamer.BaseStreamerFile — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.streamer.BaseStreamerFile
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.streamer.BaseStreamerFile
+
+-
+class cmoncrawl.processor.pipeline.streamer.BaseStreamerFile(root: Path, max_directory_size: int, max_file_size: int, extension: str, directory_prefix: str = 'directory_', max_retries: int = 3)
+Abstract Class which defines the basic functionality of a file streamer
+
+-
+__init__(root: Path, max_directory_size: int, max_file_size: int, extension: str, directory_prefix: str = 'directory_', max_retries: int = 3)
+
+
+Methods
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.metadata_to_string.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.metadata_to_string.html
new file mode 100644
index 00000000..e91a8024
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.metadata_to_string.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.metadata_to_string — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.metadata_to_string
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.stream.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.stream.html
new file mode 100644
index 00000000..176fdddc
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.stream.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.stream — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.stream
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.stream
+
+-
+async BaseStreamerFile.stream(extracted_data: Dict[Any, Any], metadata: PipeMetadata) → Path
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.IStreamer.__init__.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.IStreamer.__init__.html
new file mode 100644
index 00000000..e9c1cd25
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.IStreamer.__init__.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.streamer.IStreamer.__init__ — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.streamer.IStreamer.__init__
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.streamer.IStreamer.__init__
+
+-
+IStreamer.__init__()
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.IStreamer.clean_up.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.IStreamer.clean_up.html
new file mode 100644
index 00000000..c3434ec8
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.IStreamer.clean_up.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.streamer.IStreamer.clean_up — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.streamer.IStreamer.clean_up
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.streamer.IStreamer.clean_up
+
+-
+abstract async IStreamer.clean_up() → None
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.IStreamer.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.IStreamer.html
new file mode 100644
index 00000000..c1b12137
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.IStreamer.html
@@ -0,0 +1,565 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.streamer.IStreamer — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.streamer.IStreamer
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.streamer.IStreamer
+
+-
+class cmoncrawl.processor.pipeline.streamer.IStreamer
+Base class for all outstreamers
+
+-
+__init__()
+
+
+Methods
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.IStreamer.stream.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.IStreamer.stream.html
new file mode 100644
index 00000000..80fcf032
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.IStreamer.stream.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.streamer.IStreamer.stream — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.streamer.IStreamer.stream
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.streamer.IStreamer.stream
+
+-
+abstract async IStreamer.stream(extracted_data: Dict[Any, Any], metadata: PipeMetadata) → Path
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerDummy.__init__.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerDummy.__init__.html
new file mode 100644
index 00000000..f271a713
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerDummy.__init__.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.streamer.StreamerDummy.__init__ — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.streamer.StreamerDummy.__init__
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.streamer.StreamerDummy.__init__
+
+-
+StreamerDummy.__init__()
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerDummy.clean_up.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerDummy.clean_up.html
new file mode 100644
index 00000000..39993fd8
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerDummy.clean_up.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.streamer.StreamerDummy.clean_up — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.streamer.StreamerDummy.clean_up
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.streamer.StreamerDummy.clean_up
+
+-
+async StreamerDummy.clean_up()
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerDummy.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerDummy.html
new file mode 100644
index 00000000..dd84a35c
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerDummy.html
@@ -0,0 +1,565 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.streamer.StreamerDummy — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.streamer.StreamerDummy
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.streamer.StreamerDummy
+
+-
+class cmoncrawl.processor.pipeline.streamer.StreamerDummy
+Dummy Streamer which keeps the output in memory
+
+-
+__init__()
+
+
+Methods
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerDummy.stream.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerDummy.stream.html
new file mode 100644
index 00000000..7489f131
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerDummy.stream.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.streamer.StreamerDummy.stream — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.streamer.StreamerDummy.stream
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.streamer.StreamerDummy.stream
+
+-
+async StreamerDummy.stream(extracted_data: Dict[Any, Any], metadata: PipeMetadata)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.__init__.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.__init__.html
new file mode 100644
index 00000000..0277e500
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.__init__.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.__init__ — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.__init__
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.__init__
+
+-
+StreamerFileHTML.__init__(root: Path, max_directory_size: int)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.clean_up.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.clean_up.html
new file mode 100644
index 00000000..0b63e34a
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.clean_up.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.clean_up — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.clean_up
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.clean_up
+
+-
+async StreamerFileHTML.clean_up() → None
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.get_file_name.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.get_file_name.html
new file mode 100644
index 00000000..14a51bb5
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.get_file_name.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.get_file_name — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.get_file_name
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.get_file_name
+
+-
+StreamerFileHTML.get_file_name(metadata: PipeMetadata)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.html
new file mode 100644
index 00000000..a3d60394
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.html
@@ -0,0 +1,570 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.streamer.StreamerFileHTML — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.streamer.StreamerFileHTML
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.streamer.StreamerFileHTML
+
+-
+class cmoncrawl.processor.pipeline.streamer.StreamerFileHTML(root: Path, max_directory_size: int)
+
+-
+__init__(root: Path, max_directory_size: int)
+
+
+Methods
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.metadata_to_string.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.metadata_to_string.html
new file mode 100644
index 00000000..ef1bdd94
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.metadata_to_string.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.metadata_to_string — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.metadata_to_string
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.stream.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.stream.html
new file mode 100644
index 00000000..d3dcb135
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.stream.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.stream — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.stream
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.stream
+
+-
+async StreamerFileHTML.stream(extracted_data: Dict[Any, Any], metadata: PipeMetadata) → Path
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.__init__.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.__init__.html
new file mode 100644
index 00000000..78c54fc9
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.__init__.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.__init__ — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.__init__
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.__init__
+
+-
+StreamerFileJSON.__init__(root: Path, max_directory_size: int, max_file_size: int, pretty: bool = False)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.clean_up.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.clean_up.html
new file mode 100644
index 00000000..d94ebe8d
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.clean_up.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.clean_up — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.clean_up
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.clean_up
+
+-
+async StreamerFileJSON.clean_up() → None
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.get_file_name.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.get_file_name.html
new file mode 100644
index 00000000..87919d79
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.get_file_name.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.get_file_name — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.get_file_name
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.get_file_name
+
+-
+StreamerFileJSON.get_file_name(metadata: PipeMetadata)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.html
new file mode 100644
index 00000000..a7bda35e
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.html
@@ -0,0 +1,570 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.streamer.StreamerFileJSON — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.streamer.StreamerFileJSON
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.streamer.StreamerFileJSON
+
+-
+class cmoncrawl.processor.pipeline.streamer.StreamerFileJSON(root: Path, max_directory_size: int, max_file_size: int, pretty: bool = False)
+
+-
+__init__(root: Path, max_directory_size: int, max_file_size: int, pretty: bool = False)
+
+
+Methods
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.metadata_to_string.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.metadata_to_string.html
new file mode 100644
index 00000000..837924f0
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.metadata_to_string.html
@@ -0,0 +1,519 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.metadata_to_string — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.metadata_to_string
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.stream.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.stream.html
new file mode 100644
index 00000000..6f2a4305
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.stream.html
@@ -0,0 +1,511 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.stream — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.stream
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.stream
+
+-
+async StreamerFileJSON.stream(extracted_data: Dict[Any, Any], metadata: PipeMetadata) → Path
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.html b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.html
new file mode 100644
index 00000000..60c1dd71
--- /dev/null
+++ b/docs/build/html/generated/cmoncrawl.processor.pipeline.streamer.html
@@ -0,0 +1,493 @@
+
+
+
+
+
+
+
+
+ cmoncrawl.processor.pipeline.streamer — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
cmoncrawl.processor.pipeline.streamer
+
+
+
+
+
+
+
+
+cmoncrawl.processor.pipeline.streamer
+Classes
+
+
+
+
+
+
+BaseStreamerFile (root, max_directory_size, ...)
|
+Abstract Class which defines the basic functionality of a file streamer |
+
+IStreamer ()
|
+Base class for all outstreamers |
+
+StreamerDummy ()
|
+Dummy Streamer which keeps the output in memory |
+
+StreamerFileHTML (root, max_directory_size)
|
+ |
+
+StreamerFileJSON (root, max_directory_size, ...)
|
+ |
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/genindex.html b/docs/build/html/genindex.html
new file mode 100644
index 00000000..7dffdcaf
--- /dev/null
+++ b/docs/build/html/genindex.html
@@ -0,0 +1,1022 @@
+
+
+
+
+
+
+
+ Index — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index
+
+
+
_
+ |
A
+ |
B
+ |
C
+ |
D
+ |
E
+ |
F
+ |
G
+ |
H
+ |
I
+ |
L
+ |
M
+ |
P
+ |
R
+ |
S
+ |
T
+ |
U
+
+
+
_
+
+
+ - __init__() (cmoncrawl.aggregator.index_query.IndexAggregator method), [1]
+
+
+ - (cmoncrawl.aggregator.utils.ndjson_decoder.Decoder method), [1]
+
+ - (cmoncrawl.common.types.DomainCrawl method), [1]
+
+ - (cmoncrawl.common.types.DomainRecord method), [1]
+
+ - (cmoncrawl.common.types.ExtractConfig method), [1]
+
+ - (cmoncrawl.common.types.ExtractorConfig method), [1]
+
+ - (cmoncrawl.common.types.PipeMetadata method), [1]
+
+ - (cmoncrawl.common.types.RetrieveResponse method), [1]
+
+ - (cmoncrawl.common.types.RoutesConfig method), [1]
+
+ - (cmoncrawl.processor.pipeline.downloader.AsyncDownloader method), [1]
+
+ - (cmoncrawl.processor.pipeline.downloader.DownloaderDummy method), [1]
+
+ - (cmoncrawl.processor.pipeline.downloader.IDownloader method), [1]
+
+ - (cmoncrawl.processor.pipeline.extractor.BaseExtractor method), [1]
+
+ - (cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor method), [1]
+
+ - (cmoncrawl.processor.pipeline.extractor.HTMLExtractor method), [1]
+
+ - (cmoncrawl.processor.pipeline.extractor.IExtractor method), [1]
+
+ - (cmoncrawl.processor.pipeline.pipeline.ProcessorPipeline method), [1]
+
+ - (cmoncrawl.processor.pipeline.router.IRouter method), [1]
+
+ - (cmoncrawl.processor.pipeline.router.Route method), [1]
+
+ - (cmoncrawl.processor.pipeline.router.Router method), [1]
+
+ - (cmoncrawl.processor.pipeline.streamer.BaseStreamerFile method), [1]
+
+ - (cmoncrawl.processor.pipeline.streamer.IStreamer method), [1]
+
+ - (cmoncrawl.processor.pipeline.streamer.StreamerDummy method), [1]
+
+ - (cmoncrawl.processor.pipeline.streamer.StreamerFileHTML method), [1]
+
+ - (cmoncrawl.processor.pipeline.streamer.StreamerFileJSON method), [1]
+
+
+ |
+
+
+
A
+
+
+
B
+
+
+
C
+
+ |
+
+ -
+ cmoncrawl.common.types
+
+
+ -
+ cmoncrawl.processor
+
+
+ -
+ cmoncrawl.processor.extraction
+
+
+ -
+ cmoncrawl.processor.extraction.filters
+
+
+ -
+ cmoncrawl.processor.extraction.utils
+
+
+ -
+ cmoncrawl.processor.pipeline
+
+
+ -
+ cmoncrawl.processor.pipeline.downloader
+
+
+ -
+ cmoncrawl.processor.pipeline.extractor
+
+
+ -
+ cmoncrawl.processor.pipeline.pipeline
+
+
+ -
+ cmoncrawl.processor.pipeline.router
+
+
+ -
+ cmoncrawl.processor.pipeline.streamer
+
+
+ |
+
+
+
D
+
+
+
E
+
+
+
F
+
+
+
G
+
+
+
H
+
+
+
I
+
+
+
L
+
+
+
M
+
+
+
P
+
+
+
R
+
+
+
S
+
+
+
T
+
+
+
U
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/index.html b/docs/build/html/index.html
new file mode 100644
index 00000000..88af3333
--- /dev/null
+++ b/docs/build/html/index.html
@@ -0,0 +1,601 @@
+
+
+
+
+
+
+
+
+ Welcome to CommonCrawl Extractor’s documentation! — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Welcome to CommonCrawl Extractor’s documentation!
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/misc/domain_record.html b/docs/build/html/misc/domain_record.html
new file mode 100644
index 00000000..16265eff
--- /dev/null
+++ b/docs/build/html/misc/domain_record.html
@@ -0,0 +1,559 @@
+
+
+
+
+
+
+
+
+ Domain Record — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Domain Record
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+Domain Record
+By domain record we refer to a structure that contains the information
+about how to download a crawl of a URL. It contains the following fields:
+
+url: the url to crawl
+filename: the warc filename
+offset: the offset in the warc file
+length: the length of the html crawl
+digest [optional]: the digest of the html crawl
+encoding [optional]: the encoding of the html crawl
+timestamp [optional]: the timestamp of the crawl
+
+
+
+Domain Record JSONL format
+In order to use your own domain records with extract mode of cli,
+you must format them into the following JSON format:
+{
+ "domain_record":
+ {
+ "url": "http://example.com",
+ "filename": "crawl.warc.gz",
+ "offset": 123,
+ "length": 456,
+ "digest": "sha1:1234567890abcdef",
+ "encoding": "utf-8",
+ "timestamp": "2018-01-01T00:00:00Z"
+ },
+ "additional_info":
+ {
+ "key1": "value1",
+ "key2": "value2"
+ }
+}
+
+
+Each such json must be on a separate line in a file.
+You don’t have to provide all the fields, only `url`
, `filename`
,
+`offset`
and `length`
are required.
+The Athena SQL keys are:
+`u.url, cc.warc_filename, cc.warc_record_offset, cc.warc_record_length, cc.content_digest, cc.fetch_time`
+The `additional_info`
field is optional and can contain any additional
+information. It will be added to extracted fields as is. It’s useful
+when, for example, you want to record which set the URL belongs to.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/misc/index.html b/docs/build/html/misc/index.html
new file mode 100644
index 00000000..fffaa912
--- /dev/null
+++ b/docs/build/html/misc/index.html
@@ -0,0 +1,476 @@
+
+
+
+
+
+
+
+
+ Miscellaneous — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/objects.inv b/docs/build/html/objects.inv
new file mode 100644
index 00000000..dc3705f8
Binary files /dev/null and b/docs/build/html/objects.inv differ
diff --git a/docs/build/html/prog_guide/index.html b/docs/build/html/prog_guide/index.html
new file mode 100644
index 00000000..c9b0711d
--- /dev/null
+++ b/docs/build/html/prog_guide/index.html
@@ -0,0 +1,489 @@
+
+
+
+
+
+
+
+
+ Programming Guide — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Programming Guide
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/prog_guide/overview.html b/docs/build/html/prog_guide/overview.html
new file mode 100644
index 00000000..2ab9a850
--- /dev/null
+++ b/docs/build/html/prog_guide/overview.html
@@ -0,0 +1,655 @@
+
+
+
+
+
+
+
+
+ Programming Guide — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Programming Guide
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+Programming Guide
+This section provides a brief overview of the project. It should give you
+an idea of how to create your custom extraction pipeline.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/prog_guide/pip.html b/docs/build/html/prog_guide/pip.html
new file mode 100644
index 00000000..e31316a1
--- /dev/null
+++ b/docs/build/html/prog_guide/pip.html
@@ -0,0 +1,563 @@
+
+
+
+
+
+
+
+
+ Custom Pipeline — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Custom Pipeline
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+Custom Pipeline
+
+
+Putting it all together
+We now show how to create a very simple custom pipeline that will download and extract
+data into json programmatically.
+
+
Using the lib in code
+
from cmoncrawl.processor.pipeline.pipeline import ProcessorPipeline
+from cmoncrawl.processor.pipeline.downloader import AsyncDownloader
+from cmoncrawl.processor.pipeline.router import Router
+from cmoncrawl.processor.pipeline.streamer import StreamerFileJSON
+from cmoncrawl.common.loggers import all_purpose_logger
+from cmoncrawl.common.types import MatchType
+from cmoncrawl.integrations.middleware.synchronized import query_and_extract
+from pathlib import Path
+
+downloader = AsyncDownloader()
+
+your_custom_extractor = YourCustomExtractor()
+router = Router()
+router.load_extractor("ext", your_custom_extractor)
+router.register_route("ext", ".*bbc.com.*")
+streamer = StreamerFileJSON(Path("extracted"))
+pipeline = ProcessorPipeline(downloader, router, streamer)
+
+index_agg = IndexAggregator(
+ domains=["bbc.com"],
+ match_type=MatchType.DOMAIN,
+ limit=1000,
+)
+
+processed_urls = await query_and_extract(index_agg, pipeline)
+
+
+
+The code will try to extract the first 1000 pages from bbc.com using
+YourCustomExtractor and save the results to JSON files.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/py-modindex.html b/docs/build/html/py-modindex.html
new file mode 100644
index 00000000..a515c67d
--- /dev/null
+++ b/docs/build/html/py-modindex.html
@@ -0,0 +1,514 @@
+
+
+
+
+
+
+
+ Python Module Index — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Python Module Index
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/search.html b/docs/build/html/search.html
new file mode 100644
index 00000000..f9281b5c
--- /dev/null
+++ b/docs/build/html/search.html
@@ -0,0 +1,438 @@
+
+
+
+
+
+
+
+ Search — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Search
+
+
+
+
+
+ Searching for multiple words only shows matches that contain
+ all words.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/build/html/searchindex.js b/docs/build/html/searchindex.js
new file mode 100644
index 00000000..521a55f0
--- /dev/null
+++ b/docs/build/html/searchindex.js
@@ -0,0 +1 @@
+Search.setIndex({"docnames": ["api", "cli/cli", "cli/download", "cli/extract", "cli/index", "extraction/config_file", "extraction/creating_extractor", "extraction/index", "extraction/utils", "generated/cmoncrawl", "generated/cmoncrawl.aggregator", "generated/cmoncrawl.aggregator.index_query", "generated/cmoncrawl.aggregator.index_query.IndexAggregator", "generated/cmoncrawl.aggregator.index_query.IndexAggregator.__init__", "generated/cmoncrawl.aggregator.index_query.IndexAggregator.aclose", "generated/cmoncrawl.aggregator.index_query.IndexAggregator.aopen", "generated/cmoncrawl.aggregator.index_query.IndexAggregator.get_all_CC_indexes", "generated/cmoncrawl.aggregator.index_query.IndexAggregator.get_captured_responses", "generated/cmoncrawl.aggregator.index_query.IndexAggregator.get_number_of_pages", "generated/cmoncrawl.aggregator.utils", "generated/cmoncrawl.aggregator.utils.helpers", "generated/cmoncrawl.aggregator.utils.ndjson_decoder", "generated/cmoncrawl.aggregator.utils.ndjson_decoder.Decoder", "generated/cmoncrawl.aggregator.utils.ndjson_decoder.Decoder.__init__", "generated/cmoncrawl.aggregator.utils.ndjson_decoder.Decoder.decode", "generated/cmoncrawl.aggregator.utils.ndjson_decoder.Decoder.raw_decode", "generated/cmoncrawl.common", "generated/cmoncrawl.common.loggers", "generated/cmoncrawl.common.types", "generated/cmoncrawl.common.types.DomainCrawl", "generated/cmoncrawl.common.types.DomainCrawl.__init__", "generated/cmoncrawl.common.types.DomainRecord", "generated/cmoncrawl.common.types.DomainRecord.__init__", "generated/cmoncrawl.common.types.DomainRecord.from_dict", "generated/cmoncrawl.common.types.DomainRecord.from_json", "generated/cmoncrawl.common.types.DomainRecord.schema", "generated/cmoncrawl.common.types.DomainRecord.to_dict", "generated/cmoncrawl.common.types.DomainRecord.to_json", "generated/cmoncrawl.common.types.ExtractConfig", "generated/cmoncrawl.common.types.ExtractConfig.__init__", 
"generated/cmoncrawl.common.types.ExtractConfig.from_dict", "generated/cmoncrawl.common.types.ExtractConfig.from_json", "generated/cmoncrawl.common.types.ExtractConfig.schema", "generated/cmoncrawl.common.types.ExtractConfig.to_dict", "generated/cmoncrawl.common.types.ExtractConfig.to_json", "generated/cmoncrawl.common.types.ExtractorConfig", "generated/cmoncrawl.common.types.ExtractorConfig.__init__", "generated/cmoncrawl.common.types.ExtractorConfig.from_dict", "generated/cmoncrawl.common.types.ExtractorConfig.from_json", "generated/cmoncrawl.common.types.ExtractorConfig.schema", "generated/cmoncrawl.common.types.ExtractorConfig.to_dict", "generated/cmoncrawl.common.types.ExtractorConfig.to_json", "generated/cmoncrawl.common.types.PipeMetadata", "generated/cmoncrawl.common.types.PipeMetadata.__init__", "generated/cmoncrawl.common.types.RetrieveResponse", "generated/cmoncrawl.common.types.RetrieveResponse.__init__", "generated/cmoncrawl.common.types.RoutesConfig", "generated/cmoncrawl.common.types.RoutesConfig.__init__", "generated/cmoncrawl.common.types.RoutesConfig.from_dict", "generated/cmoncrawl.common.types.RoutesConfig.from_json", "generated/cmoncrawl.common.types.RoutesConfig.schema", "generated/cmoncrawl.common.types.RoutesConfig.to_dict", "generated/cmoncrawl.common.types.RoutesConfig.to_json", "generated/cmoncrawl.processor", "generated/cmoncrawl.processor.extraction", "generated/cmoncrawl.processor.extraction.filters", "generated/cmoncrawl.processor.extraction.utils", "generated/cmoncrawl.processor.pipeline", "generated/cmoncrawl.processor.pipeline.downloader", "generated/cmoncrawl.processor.pipeline.downloader.AsyncDownloader", "generated/cmoncrawl.processor.pipeline.downloader.AsyncDownloader.__init__", "generated/cmoncrawl.processor.pipeline.downloader.AsyncDownloader.aclose", "generated/cmoncrawl.processor.pipeline.downloader.AsyncDownloader.aopen", "generated/cmoncrawl.processor.pipeline.downloader.AsyncDownloader.download", 
"generated/cmoncrawl.processor.pipeline.downloader.AsyncDownloader.unwrap", "generated/cmoncrawl.processor.pipeline.downloader.DownloaderDummy", "generated/cmoncrawl.processor.pipeline.downloader.DownloaderDummy.__init__", "generated/cmoncrawl.processor.pipeline.downloader.DownloaderDummy.download", "generated/cmoncrawl.processor.pipeline.downloader.DownloaderDummy.extract_url", "generated/cmoncrawl.processor.pipeline.downloader.DownloaderDummy.extract_year", "generated/cmoncrawl.processor.pipeline.downloader.DownloaderDummy.mine_metadata", "generated/cmoncrawl.processor.pipeline.downloader.IDownloader", "generated/cmoncrawl.processor.pipeline.downloader.IDownloader.__init__", "generated/cmoncrawl.processor.pipeline.downloader.IDownloader.download", "generated/cmoncrawl.processor.pipeline.extractor", "generated/cmoncrawl.processor.pipeline.extractor.BaseExtractor", "generated/cmoncrawl.processor.pipeline.extractor.BaseExtractor.__init__", "generated/cmoncrawl.processor.pipeline.extractor.BaseExtractor.extract", "generated/cmoncrawl.processor.pipeline.extractor.BaseExtractor.extract_soup", "generated/cmoncrawl.processor.pipeline.extractor.BaseExtractor.filter_raw", "generated/cmoncrawl.processor.pipeline.extractor.BaseExtractor.filter_soup", "generated/cmoncrawl.processor.pipeline.extractor.BaseExtractor.preprocess", "generated/cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor", "generated/cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.__init__", "generated/cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.extract", "generated/cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.extract_soup", "generated/cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.filter_raw", "generated/cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.filter_soup", "generated/cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.preprocess", "generated/cmoncrawl.processor.pipeline.extractor.HTMLExtractor", 
"generated/cmoncrawl.processor.pipeline.extractor.HTMLExtractor.__init__", "generated/cmoncrawl.processor.pipeline.extractor.HTMLExtractor.extract", "generated/cmoncrawl.processor.pipeline.extractor.HTMLExtractor.extract_soup", "generated/cmoncrawl.processor.pipeline.extractor.HTMLExtractor.filter_raw", "generated/cmoncrawl.processor.pipeline.extractor.HTMLExtractor.filter_soup", "generated/cmoncrawl.processor.pipeline.extractor.HTMLExtractor.preprocess", "generated/cmoncrawl.processor.pipeline.extractor.IExtractor", "generated/cmoncrawl.processor.pipeline.extractor.IExtractor.__init__", "generated/cmoncrawl.processor.pipeline.extractor.IExtractor.extract", "generated/cmoncrawl.processor.pipeline.pipeline", "generated/cmoncrawl.processor.pipeline.pipeline.ProcessorPipeline", "generated/cmoncrawl.processor.pipeline.pipeline.ProcessorPipeline.__init__", "generated/cmoncrawl.processor.pipeline.pipeline.ProcessorPipeline.process_domain_record", "generated/cmoncrawl.processor.pipeline.router", "generated/cmoncrawl.processor.pipeline.router.IRouter", "generated/cmoncrawl.processor.pipeline.router.IRouter.__init__", "generated/cmoncrawl.processor.pipeline.router.IRouter.route", "generated/cmoncrawl.processor.pipeline.router.Route", "generated/cmoncrawl.processor.pipeline.router.Route.__init__", "generated/cmoncrawl.processor.pipeline.router.Router", "generated/cmoncrawl.processor.pipeline.router.Router.__init__", "generated/cmoncrawl.processor.pipeline.router.Router.load_extractor", "generated/cmoncrawl.processor.pipeline.router.Router.load_module", "generated/cmoncrawl.processor.pipeline.router.Router.load_module_as_extractor", "generated/cmoncrawl.processor.pipeline.router.Router.load_modules", "generated/cmoncrawl.processor.pipeline.router.Router.register_route", "generated/cmoncrawl.processor.pipeline.router.Router.register_routes", "generated/cmoncrawl.processor.pipeline.router.Router.route", "generated/cmoncrawl.processor.pipeline.streamer", 
"generated/cmoncrawl.processor.pipeline.streamer.BaseStreamerFile", "generated/cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.__init__", "generated/cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.clean_up", "generated/cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.get_file_name", "generated/cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.metadata_to_string", "generated/cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.stream", "generated/cmoncrawl.processor.pipeline.streamer.IStreamer", "generated/cmoncrawl.processor.pipeline.streamer.IStreamer.__init__", "generated/cmoncrawl.processor.pipeline.streamer.IStreamer.clean_up", "generated/cmoncrawl.processor.pipeline.streamer.IStreamer.stream", "generated/cmoncrawl.processor.pipeline.streamer.StreamerDummy", "generated/cmoncrawl.processor.pipeline.streamer.StreamerDummy.__init__", "generated/cmoncrawl.processor.pipeline.streamer.StreamerDummy.clean_up", "generated/cmoncrawl.processor.pipeline.streamer.StreamerDummy.stream", "generated/cmoncrawl.processor.pipeline.streamer.StreamerFileHTML", "generated/cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.__init__", "generated/cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.clean_up", "generated/cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.get_file_name", "generated/cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.metadata_to_string", "generated/cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.stream", "generated/cmoncrawl.processor.pipeline.streamer.StreamerFileJSON", "generated/cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.__init__", "generated/cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.clean_up", "generated/cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.get_file_name", "generated/cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.metadata_to_string", "generated/cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.stream", "index", "misc/domain_record", "misc/index", 
"prog_guide/index", "prog_guide/overview", "prog_guide/pip", "usage"], "filenames": ["api.rst", "cli/cli.rst", "cli/download.rst", "cli/extract.rst", "cli/index.rst", "extraction/config_file.rst", "extraction/creating_extractor.rst", "extraction/index.rst", "extraction/utils.rst", "generated/cmoncrawl.rst", "generated/cmoncrawl.aggregator.rst", "generated/cmoncrawl.aggregator.index_query.rst", "generated/cmoncrawl.aggregator.index_query.IndexAggregator.rst", "generated/cmoncrawl.aggregator.index_query.IndexAggregator.__init__.rst", "generated/cmoncrawl.aggregator.index_query.IndexAggregator.aclose.rst", "generated/cmoncrawl.aggregator.index_query.IndexAggregator.aopen.rst", "generated/cmoncrawl.aggregator.index_query.IndexAggregator.get_all_CC_indexes.rst", "generated/cmoncrawl.aggregator.index_query.IndexAggregator.get_captured_responses.rst", "generated/cmoncrawl.aggregator.index_query.IndexAggregator.get_number_of_pages.rst", "generated/cmoncrawl.aggregator.utils.rst", "generated/cmoncrawl.aggregator.utils.helpers.rst", "generated/cmoncrawl.aggregator.utils.ndjson_decoder.rst", "generated/cmoncrawl.aggregator.utils.ndjson_decoder.Decoder.rst", "generated/cmoncrawl.aggregator.utils.ndjson_decoder.Decoder.__init__.rst", "generated/cmoncrawl.aggregator.utils.ndjson_decoder.Decoder.decode.rst", "generated/cmoncrawl.aggregator.utils.ndjson_decoder.Decoder.raw_decode.rst", "generated/cmoncrawl.common.rst", "generated/cmoncrawl.common.loggers.rst", "generated/cmoncrawl.common.types.rst", "generated/cmoncrawl.common.types.DomainCrawl.rst", "generated/cmoncrawl.common.types.DomainCrawl.__init__.rst", "generated/cmoncrawl.common.types.DomainRecord.rst", "generated/cmoncrawl.common.types.DomainRecord.__init__.rst", "generated/cmoncrawl.common.types.DomainRecord.from_dict.rst", "generated/cmoncrawl.common.types.DomainRecord.from_json.rst", "generated/cmoncrawl.common.types.DomainRecord.schema.rst", "generated/cmoncrawl.common.types.DomainRecord.to_dict.rst", 
"generated/cmoncrawl.common.types.DomainRecord.to_json.rst", "generated/cmoncrawl.common.types.ExtractConfig.rst", "generated/cmoncrawl.common.types.ExtractConfig.__init__.rst", "generated/cmoncrawl.common.types.ExtractConfig.from_dict.rst", "generated/cmoncrawl.common.types.ExtractConfig.from_json.rst", "generated/cmoncrawl.common.types.ExtractConfig.schema.rst", "generated/cmoncrawl.common.types.ExtractConfig.to_dict.rst", "generated/cmoncrawl.common.types.ExtractConfig.to_json.rst", "generated/cmoncrawl.common.types.ExtractorConfig.rst", "generated/cmoncrawl.common.types.ExtractorConfig.__init__.rst", "generated/cmoncrawl.common.types.ExtractorConfig.from_dict.rst", "generated/cmoncrawl.common.types.ExtractorConfig.from_json.rst", "generated/cmoncrawl.common.types.ExtractorConfig.schema.rst", "generated/cmoncrawl.common.types.ExtractorConfig.to_dict.rst", "generated/cmoncrawl.common.types.ExtractorConfig.to_json.rst", "generated/cmoncrawl.common.types.PipeMetadata.rst", "generated/cmoncrawl.common.types.PipeMetadata.__init__.rst", "generated/cmoncrawl.common.types.RetrieveResponse.rst", "generated/cmoncrawl.common.types.RetrieveResponse.__init__.rst", "generated/cmoncrawl.common.types.RoutesConfig.rst", "generated/cmoncrawl.common.types.RoutesConfig.__init__.rst", "generated/cmoncrawl.common.types.RoutesConfig.from_dict.rst", "generated/cmoncrawl.common.types.RoutesConfig.from_json.rst", "generated/cmoncrawl.common.types.RoutesConfig.schema.rst", "generated/cmoncrawl.common.types.RoutesConfig.to_dict.rst", "generated/cmoncrawl.common.types.RoutesConfig.to_json.rst", "generated/cmoncrawl.processor.rst", "generated/cmoncrawl.processor.extraction.rst", "generated/cmoncrawl.processor.extraction.filters.rst", "generated/cmoncrawl.processor.extraction.utils.rst", "generated/cmoncrawl.processor.pipeline.rst", "generated/cmoncrawl.processor.pipeline.downloader.rst", "generated/cmoncrawl.processor.pipeline.downloader.AsyncDownloader.rst", 
"generated/cmoncrawl.processor.pipeline.downloader.AsyncDownloader.__init__.rst", "generated/cmoncrawl.processor.pipeline.downloader.AsyncDownloader.aclose.rst", "generated/cmoncrawl.processor.pipeline.downloader.AsyncDownloader.aopen.rst", "generated/cmoncrawl.processor.pipeline.downloader.AsyncDownloader.download.rst", "generated/cmoncrawl.processor.pipeline.downloader.AsyncDownloader.unwrap.rst", "generated/cmoncrawl.processor.pipeline.downloader.DownloaderDummy.rst", "generated/cmoncrawl.processor.pipeline.downloader.DownloaderDummy.__init__.rst", "generated/cmoncrawl.processor.pipeline.downloader.DownloaderDummy.download.rst", "generated/cmoncrawl.processor.pipeline.downloader.DownloaderDummy.extract_url.rst", "generated/cmoncrawl.processor.pipeline.downloader.DownloaderDummy.extract_year.rst", "generated/cmoncrawl.processor.pipeline.downloader.DownloaderDummy.mine_metadata.rst", "generated/cmoncrawl.processor.pipeline.downloader.IDownloader.rst", "generated/cmoncrawl.processor.pipeline.downloader.IDownloader.__init__.rst", "generated/cmoncrawl.processor.pipeline.downloader.IDownloader.download.rst", "generated/cmoncrawl.processor.pipeline.extractor.rst", "generated/cmoncrawl.processor.pipeline.extractor.BaseExtractor.rst", "generated/cmoncrawl.processor.pipeline.extractor.BaseExtractor.__init__.rst", "generated/cmoncrawl.processor.pipeline.extractor.BaseExtractor.extract.rst", "generated/cmoncrawl.processor.pipeline.extractor.BaseExtractor.extract_soup.rst", "generated/cmoncrawl.processor.pipeline.extractor.BaseExtractor.filter_raw.rst", "generated/cmoncrawl.processor.pipeline.extractor.BaseExtractor.filter_soup.rst", "generated/cmoncrawl.processor.pipeline.extractor.BaseExtractor.preprocess.rst", "generated/cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.rst", "generated/cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.__init__.rst", "generated/cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.extract.rst", 
"generated/cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.extract_soup.rst", "generated/cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.filter_raw.rst", "generated/cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.filter_soup.rst", "generated/cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.preprocess.rst", "generated/cmoncrawl.processor.pipeline.extractor.HTMLExtractor.rst", "generated/cmoncrawl.processor.pipeline.extractor.HTMLExtractor.__init__.rst", "generated/cmoncrawl.processor.pipeline.extractor.HTMLExtractor.extract.rst", "generated/cmoncrawl.processor.pipeline.extractor.HTMLExtractor.extract_soup.rst", "generated/cmoncrawl.processor.pipeline.extractor.HTMLExtractor.filter_raw.rst", "generated/cmoncrawl.processor.pipeline.extractor.HTMLExtractor.filter_soup.rst", "generated/cmoncrawl.processor.pipeline.extractor.HTMLExtractor.preprocess.rst", "generated/cmoncrawl.processor.pipeline.extractor.IExtractor.rst", "generated/cmoncrawl.processor.pipeline.extractor.IExtractor.__init__.rst", "generated/cmoncrawl.processor.pipeline.extractor.IExtractor.extract.rst", "generated/cmoncrawl.processor.pipeline.pipeline.rst", "generated/cmoncrawl.processor.pipeline.pipeline.ProcessorPipeline.rst", "generated/cmoncrawl.processor.pipeline.pipeline.ProcessorPipeline.__init__.rst", "generated/cmoncrawl.processor.pipeline.pipeline.ProcessorPipeline.process_domain_record.rst", "generated/cmoncrawl.processor.pipeline.router.rst", "generated/cmoncrawl.processor.pipeline.router.IRouter.rst", "generated/cmoncrawl.processor.pipeline.router.IRouter.__init__.rst", "generated/cmoncrawl.processor.pipeline.router.IRouter.route.rst", "generated/cmoncrawl.processor.pipeline.router.Route.rst", "generated/cmoncrawl.processor.pipeline.router.Route.__init__.rst", "generated/cmoncrawl.processor.pipeline.router.Router.rst", "generated/cmoncrawl.processor.pipeline.router.Router.__init__.rst", 
"generated/cmoncrawl.processor.pipeline.router.Router.load_extractor.rst", "generated/cmoncrawl.processor.pipeline.router.Router.load_module.rst", "generated/cmoncrawl.processor.pipeline.router.Router.load_module_as_extractor.rst", "generated/cmoncrawl.processor.pipeline.router.Router.load_modules.rst", "generated/cmoncrawl.processor.pipeline.router.Router.register_route.rst", "generated/cmoncrawl.processor.pipeline.router.Router.register_routes.rst", "generated/cmoncrawl.processor.pipeline.router.Router.route.rst", "generated/cmoncrawl.processor.pipeline.streamer.rst", "generated/cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.rst", "generated/cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.__init__.rst", "generated/cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.clean_up.rst", "generated/cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.get_file_name.rst", "generated/cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.metadata_to_string.rst", "generated/cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.stream.rst", "generated/cmoncrawl.processor.pipeline.streamer.IStreamer.rst", "generated/cmoncrawl.processor.pipeline.streamer.IStreamer.__init__.rst", "generated/cmoncrawl.processor.pipeline.streamer.IStreamer.clean_up.rst", "generated/cmoncrawl.processor.pipeline.streamer.IStreamer.stream.rst", "generated/cmoncrawl.processor.pipeline.streamer.StreamerDummy.rst", "generated/cmoncrawl.processor.pipeline.streamer.StreamerDummy.__init__.rst", "generated/cmoncrawl.processor.pipeline.streamer.StreamerDummy.clean_up.rst", "generated/cmoncrawl.processor.pipeline.streamer.StreamerDummy.stream.rst", "generated/cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.rst", "generated/cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.__init__.rst", "generated/cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.clean_up.rst", "generated/cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.get_file_name.rst", 
"generated/cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.metadata_to_string.rst", "generated/cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.stream.rst", "generated/cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.rst", "generated/cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.__init__.rst", "generated/cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.clean_up.rst", "generated/cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.get_file_name.rst", "generated/cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.metadata_to_string.rst", "generated/cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.stream.rst", "index.rst", "misc/domain_record.rst", "misc/index.rst", "prog_guide/index.rst", "prog_guide/overview.rst", "prog_guide/pip.rst", "usage.rst"], "titles": ["API", "Command Line Interface", "Command Line Download", "Command line Extract", "Command Line Interface", "Extractor config file", "Custom Extractor", "Extraction", "Extraction utils", "cmoncrawl", "cmoncrawl.aggregator", "cmoncrawl.aggregator.index_query", "cmoncrawl.aggregator.index_query.IndexAggregator", "cmoncrawl.aggregator.index_query.IndexAggregator.__init__", "cmoncrawl.aggregator.index_query.IndexAggregator.aclose", "cmoncrawl.aggregator.index_query.IndexAggregator.aopen", "cmoncrawl.aggregator.index_query.IndexAggregator.get_all_CC_indexes", "cmoncrawl.aggregator.index_query.IndexAggregator.get_captured_responses", "cmoncrawl.aggregator.index_query.IndexAggregator.get_number_of_pages", "cmoncrawl.aggregator.utils", "cmoncrawl.aggregator.utils.helpers", "cmoncrawl.aggregator.utils.ndjson_decoder", "cmoncrawl.aggregator.utils.ndjson_decoder.Decoder", "cmoncrawl.aggregator.utils.ndjson_decoder.Decoder.__init__", "cmoncrawl.aggregator.utils.ndjson_decoder.Decoder.decode", "cmoncrawl.aggregator.utils.ndjson_decoder.Decoder.raw_decode", "cmoncrawl.common", "cmoncrawl.common.loggers", "cmoncrawl.common.types", "cmoncrawl.common.types.DomainCrawl", 
"cmoncrawl.common.types.DomainCrawl.__init__", "cmoncrawl.common.types.DomainRecord", "cmoncrawl.common.types.DomainRecord.__init__", "cmoncrawl.common.types.DomainRecord.from_dict", "cmoncrawl.common.types.DomainRecord.from_json", "cmoncrawl.common.types.DomainRecord.schema", "cmoncrawl.common.types.DomainRecord.to_dict", "cmoncrawl.common.types.DomainRecord.to_json", "cmoncrawl.common.types.ExtractConfig", "cmoncrawl.common.types.ExtractConfig.__init__", "cmoncrawl.common.types.ExtractConfig.from_dict", "cmoncrawl.common.types.ExtractConfig.from_json", "cmoncrawl.common.types.ExtractConfig.schema", "cmoncrawl.common.types.ExtractConfig.to_dict", "cmoncrawl.common.types.ExtractConfig.to_json", "cmoncrawl.common.types.ExtractorConfig", "cmoncrawl.common.types.ExtractorConfig.__init__", "cmoncrawl.common.types.ExtractorConfig.from_dict", "cmoncrawl.common.types.ExtractorConfig.from_json", "cmoncrawl.common.types.ExtractorConfig.schema", "cmoncrawl.common.types.ExtractorConfig.to_dict", "cmoncrawl.common.types.ExtractorConfig.to_json", "cmoncrawl.common.types.PipeMetadata", "cmoncrawl.common.types.PipeMetadata.__init__", "cmoncrawl.common.types.RetrieveResponse", "cmoncrawl.common.types.RetrieveResponse.__init__", "cmoncrawl.common.types.RoutesConfig", "cmoncrawl.common.types.RoutesConfig.__init__", "cmoncrawl.common.types.RoutesConfig.from_dict", "cmoncrawl.common.types.RoutesConfig.from_json", "cmoncrawl.common.types.RoutesConfig.schema", "cmoncrawl.common.types.RoutesConfig.to_dict", "cmoncrawl.common.types.RoutesConfig.to_json", "cmoncrawl.processor", "cmoncrawl.processor.extraction", "cmoncrawl.processor.extraction.filters", "cmoncrawl.processor.extraction.utils", "cmoncrawl.processor.pipeline", "cmoncrawl.processor.pipeline.downloader", "cmoncrawl.processor.pipeline.downloader.AsyncDownloader", "cmoncrawl.processor.pipeline.downloader.AsyncDownloader.__init__", "cmoncrawl.processor.pipeline.downloader.AsyncDownloader.aclose", 
"cmoncrawl.processor.pipeline.downloader.AsyncDownloader.aopen", "cmoncrawl.processor.pipeline.downloader.AsyncDownloader.download", "cmoncrawl.processor.pipeline.downloader.AsyncDownloader.unwrap", "cmoncrawl.processor.pipeline.downloader.DownloaderDummy", "cmoncrawl.processor.pipeline.downloader.DownloaderDummy.__init__", "cmoncrawl.processor.pipeline.downloader.DownloaderDummy.download", "cmoncrawl.processor.pipeline.downloader.DownloaderDummy.extract_url", "cmoncrawl.processor.pipeline.downloader.DownloaderDummy.extract_year", "cmoncrawl.processor.pipeline.downloader.DownloaderDummy.mine_metadata", "cmoncrawl.processor.pipeline.downloader.IDownloader", "cmoncrawl.processor.pipeline.downloader.IDownloader.__init__", "cmoncrawl.processor.pipeline.downloader.IDownloader.download", "cmoncrawl.processor.pipeline.extractor", "cmoncrawl.processor.pipeline.extractor.BaseExtractor", "cmoncrawl.processor.pipeline.extractor.BaseExtractor.__init__", "cmoncrawl.processor.pipeline.extractor.BaseExtractor.extract", "cmoncrawl.processor.pipeline.extractor.BaseExtractor.extract_soup", "cmoncrawl.processor.pipeline.extractor.BaseExtractor.filter_raw", "cmoncrawl.processor.pipeline.extractor.BaseExtractor.filter_soup", "cmoncrawl.processor.pipeline.extractor.BaseExtractor.preprocess", "cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor", "cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.__init__", "cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.extract", "cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.extract_soup", "cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.filter_raw", "cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.filter_soup", "cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.preprocess", "cmoncrawl.processor.pipeline.extractor.HTMLExtractor", "cmoncrawl.processor.pipeline.extractor.HTMLExtractor.__init__", "cmoncrawl.processor.pipeline.extractor.HTMLExtractor.extract", 
"cmoncrawl.processor.pipeline.extractor.HTMLExtractor.extract_soup", "cmoncrawl.processor.pipeline.extractor.HTMLExtractor.filter_raw", "cmoncrawl.processor.pipeline.extractor.HTMLExtractor.filter_soup", "cmoncrawl.processor.pipeline.extractor.HTMLExtractor.preprocess", "cmoncrawl.processor.pipeline.extractor.IExtractor", "cmoncrawl.processor.pipeline.extractor.IExtractor.__init__", "cmoncrawl.processor.pipeline.extractor.IExtractor.extract", "cmoncrawl.processor.pipeline.pipeline", "cmoncrawl.processor.pipeline.pipeline.ProcessorPipeline", "cmoncrawl.processor.pipeline.pipeline.ProcessorPipeline.__init__", "cmoncrawl.processor.pipeline.pipeline.ProcessorPipeline.process_domain_record", "cmoncrawl.processor.pipeline.router", "cmoncrawl.processor.pipeline.router.IRouter", "cmoncrawl.processor.pipeline.router.IRouter.__init__", "cmoncrawl.processor.pipeline.router.IRouter.route", "cmoncrawl.processor.pipeline.router.Route", "cmoncrawl.processor.pipeline.router.Route.__init__", "cmoncrawl.processor.pipeline.router.Router", "cmoncrawl.processor.pipeline.router.Router.__init__", "cmoncrawl.processor.pipeline.router.Router.load_extractor", "cmoncrawl.processor.pipeline.router.Router.load_module", "cmoncrawl.processor.pipeline.router.Router.load_module_as_extractor", "cmoncrawl.processor.pipeline.router.Router.load_modules", "cmoncrawl.processor.pipeline.router.Router.register_route", "cmoncrawl.processor.pipeline.router.Router.register_routes", "cmoncrawl.processor.pipeline.router.Router.route", "cmoncrawl.processor.pipeline.streamer", "cmoncrawl.processor.pipeline.streamer.BaseStreamerFile", "cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.__init__", "cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.clean_up", "cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.get_file_name", "cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.metadata_to_string", "cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.stream", 
"cmoncrawl.processor.pipeline.streamer.IStreamer", "cmoncrawl.processor.pipeline.streamer.IStreamer.__init__", "cmoncrawl.processor.pipeline.streamer.IStreamer.clean_up", "cmoncrawl.processor.pipeline.streamer.IStreamer.stream", "cmoncrawl.processor.pipeline.streamer.StreamerDummy", "cmoncrawl.processor.pipeline.streamer.StreamerDummy.__init__", "cmoncrawl.processor.pipeline.streamer.StreamerDummy.clean_up", "cmoncrawl.processor.pipeline.streamer.StreamerDummy.stream", "cmoncrawl.processor.pipeline.streamer.StreamerFileHTML", "cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.__init__", "cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.clean_up", "cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.get_file_name", "cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.metadata_to_string", "cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.stream", "cmoncrawl.processor.pipeline.streamer.StreamerFileJSON", "cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.__init__", "cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.clean_up", "cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.get_file_name", "cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.metadata_to_string", "cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.stream", "Welcome to CommonCrawl Extractor\u2019s documentation!", "Domain Record", "Miscellaneous", "Programming Guide", "Programming Guide", "Custom Pipeline", "Usage"], "terms": {"The": [1, 2, 3, 5, 6, 8, 22, 23, 156, 159, 160, 161], "i": [1, 3, 5, 6, 8, 22, 23, 139, 156, 159, 161], "simpl": [1, 160, 161], "wrapper": 1, "around": 1, "librari": [1, 161], "It": [1, 5, 6, 8, 75, 156, 159, 161], "provid": [1, 2, 8, 22, 23, 156, 159, 161], "two": [1, 6, 161], "main": [1, 2], "function": [1, 8, 11, 20, 65, 66, 129, 159], "download": [1, 3, 4, 110, 111, 155, 156, 158, 160, 161], "sampl": 1, "either": [1, 159], "domain": [1, 2, 3, 5, 12, 13, 17, 18, 29, 30, 155, 157, 160, 161], "record": [1, 4, 5, 155, 157, 160, 161], 
"html": [1, 2, 4, 6, 92, 99, 155, 156, 159, 161], "from": [1, 2, 3, 5, 6, 8, 25, 75, 155, 158, 160, 161], "common": [1, 2, 5, 6, 155, 158, 160], "crawl": [1, 2, 5, 6, 155, 156, 158], "index": [1, 2, 12, 13, 25, 155, 159, 161], "extract": [1, 4, 5, 75, 92, 99, 155, 156, 158, 160, 161], "an": [1, 3, 6, 22, 23, 156, 159], "content": [1, 3, 5, 54, 55, 78, 80, 159], "can": [1, 3, 5, 6, 22, 23, 25, 156, 159, 160, 161], "also": [1, 5, 6, 22, 23, 161], "directli": [1, 3, 159], "take": [1, 3, 6, 22, 23], "data": [1, 3, 5, 6, 8, 25, 69, 70, 159, 160, 161], "both": [1, 3, 8, 161], "ar": [1, 2, 3, 5, 6, 8, 22, 23, 156, 159, 160], "invok": 1, "us": [1, 3, 5, 6, 8, 22, 23, 25, 156, 159, 160, 161], "cmon": [1, 2, 3, 5], "follow": [1, 2, 3, 5, 22, 23, 156], "requir": [1, 8, 156], "argument": [1, 4, 155], "first": [1, 2, 3, 159, 160, 161], "1000": [1, 2, 160], "com": [1, 2, 3, 156, 160], "match_typ": [1, 2, 160], "limit": [1, 2, 12, 13, 160], "dr_output": [1, 2, 3], "100": [1, 2, 3], "html_output": [1, 2, 3], "them": [1, 3, 156, 159, 160], "your": [1, 3, 6, 8, 156, 159, 161], "extractor": [1, 3, 7, 45, 56, 57, 121, 158, 161], "config": [1, 3, 7, 126, 155, 159], "json": [1, 2, 3, 12, 13, 22, 23, 24, 25, 156, 159, 160], "extracted_output": [1, 3], "jsonl": [1, 2, 3, 155, 157], "second": [1, 2, 3], "tool": [2, 3], "server": [2, 3], "queri": [2, 155, 158, 160], "commoncrawl": [2, 12, 13, 69, 70, 158, 160, 161], "need": [2, 3, 5, 6, 159, 161], "thi": [2, 3, 5, 6, 22, 23, 25, 159, 161], "order": [2, 3, 5, 22, 23, 156, 159], "url": [2, 3, 5, 31, 32, 75, 76, 116, 127, 156, 159], "output": [2, 3, 139, 159], "path": [2, 3, 5, 38, 39, 75, 76, 79, 80, 122, 123, 124, 129, 130, 134, 138, 143, 144, 148, 149, 150, 154, 160], "directori": [2, 3, 5], "file": [2, 3, 6, 7, 8, 75, 76, 129, 155, 156, 158, 160], "In": [2, 5, 156, 159], "contain": [2, 3, 8, 24, 156, 159], "one": [2, 3, 5, 22, 23, 159], "each": [2, 3, 5, 156], "found": [2, 6], "multipl": [2, 8, 159], "format": [2, 3, 5, 155, 157, 159], 
"max": [2, 3], "number": [2, 3, 22, 23], "sinc": [2, 5, 12, 13, 17, 45, 46, 117, 118, 125, 159], "start": [2, 5], "date": [2, 3, 5, 75, 76, 159], "iso": [2, 3, 5], "e": [2, 3, 5, 22, 23], "g": [2, 3, 5, 22, 23], "2020": 2, "01": [2, 3, 5, 156], "TO": 2, "end": [2, 5, 25], "cc_server": [2, 12, 13], "must": [2, 5, 6, 156], "whole": [2, 160], "http": [2, 3, 5, 12, 13, 69, 70, 156], "org": [2, 12, 13, 69, 70], "cc": [2, 156], "2023": 2, "14": 2, "max_retri": [2, 3, 12, 13, 17, 18, 69, 70, 129, 130], "retri": 2, "request": [2, 161], "increas": 2, "when": [2, 3, 156], "fail": [2, 3], "sleep_step": [2, 3, 12, 13, 17, 18, 69, 70], "addit": [2, 3, 156], "add": [2, 3, 5, 156], "sleep": [2, 3], "time": [2, 3, 116, 127], "between": [2, 3], "attempt": [2, 3], "tell": 2, "you": [2, 3, 5, 6, 156, 159, 160, 161], "slow": [2, 159, 161], "down": 2, "One": 2, "exact": 2, "prefix": 2, "host": 2, "match": [2, 5, 159], "type": [2, 6, 14, 71, 160], "refer": [2, 3, 6, 156, 159], "cdx": 2, "api": [2, 155, 161], "more": [2, 3, 161], "inform": [2, 3, 156, 159], "max_directory_s": [2, 3, 129, 130, 143, 144, 149, 150], "per": [2, 3, 159], "filter_non_200": 2, "filter": [2, 7, 155, 158], "out": [2, 8, 155, 158], "non": 2, "200": 2, "statu": [2, 54, 55], "code": [2, 7, 22, 23, 155, 160], "max_crawls_per_fil": [2, 3], "mode": [3, 4, 155, 156], "config_path": 3, "rule": 3, "output_path": 3, "To": [3, 5, 160, 161], "creat": [3, 5, 6, 8, 159, 160, 161], "see": [3, 6, 161], "yield": 3, "same": [3, 5], "which": [3, 5, 92, 99, 129, 139, 156, 159, 160, 161], "For": [3, 6, 159], "new": [3, 159], "name": [3, 5, 6, 45, 46, 52, 53, 117, 118, 121, 125], "after": 3, "appropri": [3, 159], "If": [3, 5, 6, 22, 23, 159, 161], "have": [3, 6, 25, 156], "got": 3, "some": 3, "other": [3, 161], "wai": [3, 6, 159, 161], "aw": [3, 161], "athena": [3, 156, 161], "pleas": 3, "describ": [3, 159], "how": [3, 155, 156, 158, 160, 161], "n_proc": 3, "process": [3, 159, 161], "paralel": 3, "level": 3, "thu": [3, 5, 6, 159], 
"singl": [3, 160], "": [3, 24, 25, 34, 41, 48, 59, 156, 159, 160, 161], "useless": 3, "than": [3, 159], "warc": [3, 156, 159], "2021": 3, "default": [3, 22, 23, 37, 44, 51, 62, 159], "todai": 3, "were": [3, 159], "By": [3, 22, 23, 156, 159], "try": [3, 160], "infer": 3, "10": [3, 69, 70], "www": [3, 5], "go": 3, "build": 3, "gonna": 3, "appreaci": 3, "specifi": [3, 5, 22, 23, 159], "what": [3, 159], "becaus": 3, "those": [3, 22, 23], "dure": [3, 5], "rout": [3, 5, 38, 39], "exampl": [4, 7, 155, 156], "posit": [4, 155], "option": [4, 5, 6, 12, 13, 14, 18, 31, 32, 33, 36, 37, 40, 43, 44, 45, 46, 47, 50, 51, 52, 53, 58, 61, 62, 71, 75, 76, 85, 86, 87, 88, 94, 101, 108, 125, 155, 156], "extractors_path": [5, 38, 39], "folder": [5, 124], "regex": [5, 56, 57, 117, 118, 125], "my_extractor": 5, "string": [5, 6, 22, 23, 25], "my_extractor2": 5, "another_regex": 5, "where": [5, 25], "locat": 5, "rel": 5, "current": [5, 159, 161], "work": 5, "list": [5, 12, 13, 16, 22, 23, 33, 36, 38, 39, 40, 43, 47, 50, 56, 57, 58, 61, 74, 75, 76, 83, 117, 118, 125, 126], "dictionari": [5, 6], "kei": [5, 156], "At": 5, "least": 5, "ha": 5, "python": [5, 24, 25], "without": [5, 6], "extens": [5, 6, 129, 130], "set": [5, 6, 156], "variabl": [5, 6], "overrid": 5, "valid": 5, "full": 5, "2009": 5, "01t00": [5, 156], "00": [5, 156], "all": [5, 6, 8, 135, 155, 156, 158, 159], "given": [5, 22, 23], "a_extractor": 5, "a_extractor2": 5, "b_extractor": 5, "cz": 5, "2010": 5, "cmon2": 5, "happen": 5, "A": [5, 33, 34, 35, 40, 41, 42, 47, 48, 49, 58, 59, 60, 159], "cralw": 5, "2012": 5, "might": [5, 6, 159], "want": [5, 6, 156, 159, 161], "put": [5, 155, 158], "problem": 5, "we": [5, 156, 159, 160], "load": [5, 159], "But": [5, 159], "don": [5, 6, 8, 156, 159], "t": [5, 6, 8, 22, 23, 75, 156, 159], "import": [5, 6, 159, 160], "sy": 5, "pathlib": [5, 160], "append": 5, "__file__": 5, "parent": 5, "router": [5, 110, 111, 159, 160], "everi": [5, 22, 23, 159], "should": [5, 6, 159, 161], "ani": [5, 8, 52, 
53, 54, 55, 87, 88, 94, 101, 108, 112, 133, 134, 138, 142, 147, 148, 153, 154, 156], "untrust": 5, "within": 6, "cli": [6, 156], "implement": [6, 22, 23, 159], "cmoncrawl": [6, 8, 155, 159, 160], "processor": [6, 8, 155, 159, 160], "pipelin": [6, 155, 158, 159, 161], "iextractor": [6, 116, 121, 127], "class": [6, 11, 12, 21, 22, 23, 28, 29, 31, 38, 45, 52, 54, 56, 68, 69, 75, 81, 84, 85, 92, 99, 106, 109, 110, 113, 114, 117, 119, 128, 129, 135, 139, 143, 149, 159], "howev": 6, "most": [6, 161], "case": [6, 159, 161], "assum": [6, 159], "pars": [6, 159], "beautifulsoup": [6, 78, 88, 90, 95, 97, 102, 104], "onli": [6, 35, 42, 49, 60, 156, 159], "method": [6, 12, 22, 29, 31, 38, 45, 52, 54, 56, 69, 75, 81, 85, 92, 99, 106, 110, 114, 117, 119, 129, 135, 139, 143, 149, 160], "extract_soup": 6, "object": [6, 22, 23], "metadata": [6, 52, 75, 87, 88, 89, 90, 91, 94, 95, 96, 97, 98, 101, 102, 103, 104, 105, 108, 116, 127, 132, 134, 138, 142, 146, 148, 152, 154, 159], "pipemetadata": [6, 74, 83, 87, 88, 89, 90, 91, 94, 95, 96, 97, 98, 101, 102, 103, 104, 105, 108, 116, 127, 132, 134, 138, 142, 146, 148, 152, 154], "return": [6, 8, 22, 23, 24, 25, 75, 159], "none": [6, 8, 12, 13, 14, 18, 22, 23, 29, 30, 31, 32, 34, 35, 37, 38, 39, 41, 42, 44, 45, 46, 48, 49, 51, 52, 53, 54, 55, 56, 57, 59, 60, 62, 71, 75, 76, 85, 86, 116, 117, 118, 125, 127, 131, 137, 145, 151], "page": [6, 17, 29, 30, 155, 158, 160], "extact": 6, "haven": 6, "additionali": 6, "filter_raw": [6, 159], "raw": [6, 159], "true": [6, 22, 23, 37, 44, 51, 62, 69, 70, 92, 93, 99, 100], "fals": [6, 22, 23, 33, 34, 35, 36, 37, 40, 41, 42, 43, 44, 47, 48, 49, 50, 51, 58, 59, 60, 61, 62, 149, 150], "otherwis": 6, "decid": 6, "base": [6, 135, 159], "effici": 6, "now": [6, 160], "soup": [6, 8, 88, 90, 95, 97, 102, 104, 159], "done": 6, "filter_soup": [6, 159], "final": 6, "said": 6, "here": 6, "titl": 6, "ext": [6, 160], "py": [6, 7, 155], "titleextractor": 6, "def": 6, "self": 6, "dict": [6, 22, 23, 33, 36, 40, 43, 47, 
50, 52, 53, 58, 61, 87, 88, 94, 101, 108, 112, 133, 134, 138, 142, 147, 148, 153, 154, 159], "text": 6, "bool": [6, 33, 35, 36, 37, 40, 42, 43, 44, 47, 49, 50, 51, 58, 60, 61, 62, 69, 70, 89, 90, 92, 93, 97, 99, 100, 104, 149, 150], "config_fil": 6, "would": [6, 159], "custom": [7, 22, 23, 155, 158, 159, 161], "baseextractor": [7, 155, 159], "structur": [7, 155], "__init__": [7, 12, 22, 29, 31, 38, 45, 52, 54, 56, 69, 75, 81, 85, 92, 99, 106, 110, 114, 117, 119, 129, 135, 139, 143, 149, 155], "arbitrari": [7, 155], "execut": [7, 155], "util": [7, 155], "utili": 8, "defin": [8, 22, 23, 129, 159], "helper": 8, "must_exist_filt": 8, "ulr": 8, "css": 8, "selector": 8, "must_not_exist_filt": 8, "check_requir": 8, "check": 8, "present": 8, "chain_transform": 8, "chain": 8, "transform": 8, "broken": 8, "especi": 8, "useful": [8, 156], "select": 8, "etc": 8, "extract_transform": 8, "tag": 8, "modul": [9, 10, 19, 26, 63, 64, 67, 155], "str": [12, 13, 16, 17, 18, 24, 25, 29, 30, 31, 32, 33, 34, 36, 37, 40, 41, 43, 44, 45, 46, 47, 48, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 61, 62, 69, 70, 74, 75, 76, 80, 83, 85, 86, 87, 88, 89, 91, 94, 96, 98, 101, 103, 105, 108, 112, 116, 117, 118, 121, 125, 127, 129, 130, 133, 147, 153], "cc_indexes_serv": [12, 13], "collinfo": [12, 13], "datetim": [12, 13, 17, 31, 32, 45, 46, 75, 76, 116, 117, 118, 125, 127], "1": [12, 13, 17, 52, 53, 69, 70, 155, 158], "0": [12, 13, 17, 22, 23, 25, 29, 30], "9999": [12, 13, 17], "12": [12, 13, 17], "31": [12, 13, 17, 22, 23], "23": [12, 13, 17], "59": [12, 13, 17], "999999": [12, 13, 17], "int": [12, 13, 17, 18, 22, 23, 29, 30, 31, 32, 33, 36, 37, 40, 43, 44, 47, 50, 51, 54, 55, 58, 61, 62, 69, 70, 129, 130, 143, 144, 149, 150], "5": [12, 13, 69, 70, 155, 158], "prefetch_s": [12, 13], "3": [12, 13, 129, 130, 155, 158], "20": [12, 13], "async": [14, 15, 16, 17, 18, 71, 72, 73, 77, 83, 112, 131, 134, 137, 138, 141, 142, 145, 148, 151, 154], "exc_typ": [14, 71], "baseexcept": [14, 71], "exc_val": [14, 71], 
"exc_tb": [14, 71], "tracebacktyp": [14, 71], "static": [16, 17, 18], "client": [16, 17, 18], "clientsess": [16, 17, 18], "cdx_server": [16, 17, 18, 29, 30], "page_s": 18, "object_hook": [22, 23], "parse_float": [22, 23, 34, 41, 48, 59], "parse_int": [22, 23, 34, 41, 48, 59], "parse_const": [22, 23, 34, 41, 48, 59], "strict": [22, 23], "object_pairs_hook": [22, 23], "call": [22, 23, 160], "result": [22, 23, 160], "its": [22, 23], "valu": [22, 23, 159], "place": [22, 23], "deseri": [22, 23], "support": [22, 23, 159, 161], "rpc": [22, 23], "hint": [22, 23], "pair": [22, 23], "instead": [22, 23], "featur": [22, 23], "prioriti": [22, 23], "float": [22, 23, 33, 36, 40, 43, 47, 50, 58, 61], "equival": [22, 23], "num_str": [22, 23], "anoth": [22, 23, 159], "datatyp": [22, 23], "parser": [22, 23, 159], "decim": [22, 23], "integ": [22, 23], "infin": [22, 23], "nan": [22, 23], "rais": [22, 23], "except": [22, 23, 160], "invalid": [22, 23], "encount": [22, 23], "control": [22, 23, 161], "charact": [22, 23], "allow": [22, 23, 161], "insid": [22, 23], "context": [22, 23, 35, 42, 49, 60], "rang": [22, 23], "includ": [22, 23], "tab": [22, 23], "n": [22, 23], "r": [22, 23], "arg": 24, "kwarg": 24, "represent": [24, 25], "instanc": 24, "document": [24, 25], "idx": 25, "begin": 25, "2": [25, 155, 158], "tupl": [25, 37, 44, 51, 62, 74, 83], "mai": 25, "extran": 25, "attribut": [29, 31, 38, 45, 52, 54, 56, 117], "filenam": [31, 32, 156], "offset": [31, 32, 156, 159], "length": [31, 32, 156, 159], "digest": [31, 32, 156], "encod": [31, 32, 52, 53, 69, 70, 85, 86, 156], "timestamp": [31, 32, 156], "classmethod": [33, 34, 35, 40, 41, 42, 47, 48, 49, 58, 59, 60], "kv": [33, 40, 47, 58], "union": [33, 34, 36, 37, 40, 41, 43, 44, 47, 48, 50, 51, 58, 59, 61, 62, 125], "infer_miss": [33, 34, 35, 40, 41, 42, 47, 48, 49, 58, 59, 60], "byte": [34, 41, 48, 59, 74], "bytearrai": [34, 41, 48, 59], "kw": [34, 37, 41, 44, 48, 51, 59, 62], "exclud": [35, 42, 49, 60], "mani": [35, 42, 49, 60], 
"load_onli": [35, 42, 49, 60], "dump_onli": [35, 42, 49, 60], "partial": [35, 42, 49, 60], "unknown": [35, 42, 49, 60], "schemaf": [35, 42, 49, 60], "encode_json": [36, 43, 50, 61], "skipkei": [37, 44, 51, 62], "ensure_ascii": [37, 44, 51, 62], "check_circular": [37, 44, 51, 62], "allow_nan": [37, 44, 51, 62], "indent": [37, 44, 51, 62], "separ": [37, 44, 51, 62, 156], "callabl": [37, 44, 51, 62], "sort_kei": [37, 44, 51, 62], "routesconfig": [38, 39, 126], "configur": [38, 45, 56], "run": [38, 160, 161], "domain_record": [52, 53, 73, 74, 77, 83, 112, 156], "domainrecord": [52, 53, 73, 74, 77, 83, 112], "article_data": [52, 53], "factori": [52, 53, 56, 57], "warc_head": [52, 53], "http_header": [52, 53], "latin": [52, 53, 69, 70], "pipe": 52, "reason": [54, 55], "extractorconfig": [56, 57], "base_url": [69, 70], "digest_verif": [69, 70], "idownload": [71, 110, 111], "respons": [74, 87, 89, 91, 94, 96, 98, 101, 103, 105, 108], "dummi": [75, 92, 99, 139], "test": 75, "doesn": 75, "anyth": 75, "pass": [75, 159, 160], "constructor": 75, "file_path": [79, 80], "abstract": [88, 108, 116, 129, 133, 137, 138], "filter_non_ok": [92, 93, 99, 100], "simpli": [92, 99, 159, 160], "irout": [110, 111], "outstream": [110, 111, 135, 159], "istream": [110, 111, 159], "additional_info": [112, 156], "re": 117, "pattern": [117, 118], "module_path": [122, 123], "root": [129, 130, 143, 144, 149, 150], "max_file_s": [129, 130, 149, 150], "directory_prefix": [129, 130], "directory_": [129, 130], "basic": [129, 159], "extracted_data": [133, 134, 138, 142, 147, 148, 153, 154], "keep": 139, "memori": 139, "pretti": [149, 150], "usag": 155, "workflow": 155, "command": [155, 161], "line": [155, 156, 159, 161], "interfac": [155, 161], "program": 155, "guid": 155, "theori": [155, 158], "choos": [155, 158], "4": [155, 158], "web": [155, 158], "field": [155, 156, 158], "6": [155, 158], "save": [155, 158, 160], "togeth": [155, 158], "miscellan": 155, "aggreg": [155, 159], "search": 155, "strucutur": 
156, "cotain": 156, "about": 156, "own": [156, 159], "follwo": 156, "gz": 156, "123": 156, "456": 156, "sha1": 156, "1234567890abcdef": 156, "utf": 156, "8": 156, "2018": 156, "00z": 156, "key1": 156, "value1": 156, "key2": 156, "value2": 156, "sql": 156, "u": 156, "warc_filenam": 156, "warc_record_offset": 156, "warc_record_length": 156, "content_digest": 156, "fetch_tim": 156, "ad": 156, "belong": 156, "section": 159, "brief": 159, "overview": 159, "project": 159, "give": 159, "idea": 159, "get": [159, 161], "commmoncrawl": 159, "find": 159, "link": 159, "condit": 159, "step": [159, 161], "handl": [159, 160], "while": [159, 161], "rest": 159, "store": 159, "multitud": 159, "resourc": 159, "our": 159, "bunch": 159, "possibl": 159, "part": 159, "collect": 159, "map": 159, "month": 159, "releas": 159, "understand": 159, "even": 159, "wa": 159, "certain": 159, "older": 159, "respect": 159, "make": [159, 161], "sure": 159, "miss": 159, "With": 159, "continu": 159, "index_queri": 159, "indexaggreg": [159, 160], "node": 159, "relat": 159, "queue": 159, "asyncdownload": [159, 160], "onc": 159, "dynam": 159, "definit": 159, "against": 159, "develop": 159, "chosen": 159, "drop": [159, 161], "fast": 159, "Or": 159, "wait": 159, "convers": 159, "just": 159, "streamer": [159, 160], "deriv": 159, "basestreamerfil": 159, "left": 159, "streamerfilejson": [159, 160], "streamerfilehtml": 159, "like": [159, 161], "differ": 159, "saver": 159, "inherit": 159, "chang": 159, "creation": 159, "initi": 160, "processorpipelin": 160, "process_domain_record": 160, "caller": 160, "wish": 160, "show": 160, "veri": 160, "programmat": [160, 161], "lib": 160, "logger": 160, "all_purpose_logg": 160, "matchtyp": 160, "integr": 160, "middlewar": 160, "synchron": 160, "query_and_extract": 160, "your_custom_extractor": 160, "yourcustomextractor": 160, "load_extractor": 160, "register_rout": 160, "bbc": 160, "index_agg": 160, "processed_url": 160, "await": 160, "design": 161, "interact": 161, 
"framework": 161, "suffic": 161, "99": 161, "over": 161, "further": 161, "share": 161, "so": 161, "author": 161, "law": 161, "issu": 161, "skip": 161, "under": 161, "cirmust": 161, "complet": 161, "throttl": 161, "prefer": 161, "super": 161, "cheap": 161, "parallel": 161}, "objects": {"": [[9, 0, 0, "-", "cmoncrawl"]], "cmoncrawl": [[10, 0, 0, "-", "aggregator"], [26, 0, 0, "-", "common"], [63, 0, 0, "-", "processor"]], "cmoncrawl.aggregator": [[11, 0, 0, "-", "index_query"], [19, 0, 0, "-", "utils"]], "cmoncrawl.aggregator.index_query": [[12, 1, 1, "", "IndexAggregator"]], "cmoncrawl.aggregator.index_query.IndexAggregator": [[13, 2, 1, "", "__init__"], [14, 2, 1, "", "aclose"], [15, 2, 1, "", "aopen"], [16, 2, 1, "", "get_all_CC_indexes"], [17, 2, 1, "", "get_captured_responses"], [18, 2, 1, "", "get_number_of_pages"]], "cmoncrawl.aggregator.utils": [[20, 0, 0, "-", "helpers"], [21, 0, 0, "-", "ndjson_decoder"]], "cmoncrawl.aggregator.utils.ndjson_decoder": [[22, 1, 1, "", "Decoder"]], "cmoncrawl.aggregator.utils.ndjson_decoder.Decoder": [[23, 2, 1, "", "__init__"], [24, 2, 1, "", "decode"], [25, 2, 1, "", "raw_decode"]], "cmoncrawl.common": [[27, 0, 0, "-", "loggers"], [28, 0, 0, "-", "types"]], "cmoncrawl.common.types": [[29, 1, 1, "", "DomainCrawl"], [31, 1, 1, "", "DomainRecord"], [38, 1, 1, "", "ExtractConfig"], [45, 1, 1, "", "ExtractorConfig"], [52, 1, 1, "", "PipeMetadata"], [54, 1, 1, "", "RetrieveResponse"], [56, 1, 1, "", "RoutesConfig"]], "cmoncrawl.common.types.DomainCrawl": [[30, 2, 1, "", "__init__"]], "cmoncrawl.common.types.DomainRecord": [[32, 2, 1, "", "__init__"], [33, 2, 1, "", "from_dict"], [34, 2, 1, "", "from_json"], [35, 2, 1, "", "schema"], [36, 2, 1, "", "to_dict"], [37, 2, 1, "", "to_json"]], "cmoncrawl.common.types.ExtractConfig": [[39, 2, 1, "", "__init__"], [40, 2, 1, "", "from_dict"], [41, 2, 1, "", "from_json"], [42, 2, 1, "", "schema"], [43, 2, 1, "", "to_dict"], [44, 2, 1, "", "to_json"]], 
"cmoncrawl.common.types.ExtractorConfig": [[46, 2, 1, "", "__init__"], [47, 2, 1, "", "from_dict"], [48, 2, 1, "", "from_json"], [49, 2, 1, "", "schema"], [50, 2, 1, "", "to_dict"], [51, 2, 1, "", "to_json"]], "cmoncrawl.common.types.PipeMetadata": [[53, 2, 1, "", "__init__"]], "cmoncrawl.common.types.RetrieveResponse": [[55, 2, 1, "", "__init__"]], "cmoncrawl.common.types.RoutesConfig": [[57, 2, 1, "", "__init__"], [58, 2, 1, "", "from_dict"], [59, 2, 1, "", "from_json"], [60, 2, 1, "", "schema"], [61, 2, 1, "", "to_dict"], [62, 2, 1, "", "to_json"]], "cmoncrawl.processor": [[64, 0, 0, "-", "extraction"], [67, 0, 0, "-", "pipeline"]], "cmoncrawl.processor.extraction": [[65, 0, 0, "-", "filters"], [66, 0, 0, "-", "utils"]], "cmoncrawl.processor.pipeline": [[68, 0, 0, "-", "downloader"], [84, 0, 0, "-", "extractor"], [109, 0, 0, "-", "pipeline"], [113, 0, 0, "-", "router"], [128, 0, 0, "-", "streamer"]], "cmoncrawl.processor.pipeline.downloader": [[69, 1, 1, "", "AsyncDownloader"], [75, 1, 1, "", "DownloaderDummy"], [81, 1, 1, "", "IDownloader"]], "cmoncrawl.processor.pipeline.downloader.AsyncDownloader": [[70, 2, 1, "", "__init__"], [71, 2, 1, "", "aclose"], [72, 2, 1, "", "aopen"], [73, 2, 1, "", "download"], [74, 2, 1, "", "unwrap"]], "cmoncrawl.processor.pipeline.downloader.DownloaderDummy": [[76, 2, 1, "", "__init__"], [77, 2, 1, "", "download"], [78, 2, 1, "", "extract_url"], [79, 2, 1, "", "extract_year"], [80, 2, 1, "", "mine_metadata"]], "cmoncrawl.processor.pipeline.downloader.IDownloader": [[82, 2, 1, "", "__init__"], [83, 2, 1, "", "download"]], "cmoncrawl.processor.pipeline.extractor": [[85, 1, 1, "", "BaseExtractor"], [92, 1, 1, "", "DomainRecordExtractor"], [99, 1, 1, "", "HTMLExtractor"], [106, 1, 1, "", "IExtractor"]], "cmoncrawl.processor.pipeline.extractor.BaseExtractor": [[86, 2, 1, "", "__init__"], [87, 2, 1, "", "extract"], [88, 2, 1, "", "extract_soup"], [89, 2, 1, "", "filter_raw"], [90, 2, 1, "", "filter_soup"], [91, 2, 1, "", 
"preprocess"]], "cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor": [[93, 2, 1, "", "__init__"], [94, 2, 1, "", "extract"], [95, 2, 1, "", "extract_soup"], [96, 2, 1, "", "filter_raw"], [97, 2, 1, "", "filter_soup"], [98, 2, 1, "", "preprocess"]], "cmoncrawl.processor.pipeline.extractor.HTMLExtractor": [[100, 2, 1, "", "__init__"], [101, 2, 1, "", "extract"], [102, 2, 1, "", "extract_soup"], [103, 2, 1, "", "filter_raw"], [104, 2, 1, "", "filter_soup"], [105, 2, 1, "", "preprocess"]], "cmoncrawl.processor.pipeline.extractor.IExtractor": [[107, 2, 1, "", "__init__"], [108, 2, 1, "", "extract"]], "cmoncrawl.processor.pipeline.pipeline": [[110, 1, 1, "", "ProcessorPipeline"]], "cmoncrawl.processor.pipeline.pipeline.ProcessorPipeline": [[111, 2, 1, "", "__init__"], [112, 2, 1, "", "process_domain_record"]], "cmoncrawl.processor.pipeline.router": [[114, 1, 1, "", "IRouter"], [117, 1, 1, "", "Route"], [119, 1, 1, "", "Router"]], "cmoncrawl.processor.pipeline.router.IRouter": [[115, 2, 1, "", "__init__"], [116, 2, 1, "", "route"]], "cmoncrawl.processor.pipeline.router.Route": [[118, 2, 1, "", "__init__"]], "cmoncrawl.processor.pipeline.router.Router": [[120, 2, 1, "", "__init__"], [121, 2, 1, "", "load_extractor"], [122, 2, 1, "", "load_module"], [123, 2, 1, "", "load_module_as_extractor"], [124, 2, 1, "", "load_modules"], [125, 2, 1, "", "register_route"], [126, 2, 1, "", "register_routes"], [127, 2, 1, "", "route"]], "cmoncrawl.processor.pipeline.streamer": [[129, 1, 1, "", "BaseStreamerFile"], [135, 1, 1, "", "IStreamer"], [139, 1, 1, "", "StreamerDummy"], [143, 1, 1, "", "StreamerFileHTML"], [149, 1, 1, "", "StreamerFileJSON"]], "cmoncrawl.processor.pipeline.streamer.BaseStreamerFile": [[130, 2, 1, "", "__init__"], [131, 2, 1, "", "clean_up"], [132, 2, 1, "", "get_file_name"], [133, 2, 1, "", "metadata_to_string"], [134, 2, 1, "", "stream"]], "cmoncrawl.processor.pipeline.streamer.IStreamer": [[136, 2, 1, "", "__init__"], [137, 2, 1, "", "clean_up"], [138, 
2, 1, "", "stream"]], "cmoncrawl.processor.pipeline.streamer.StreamerDummy": [[140, 2, 1, "", "__init__"], [141, 2, 1, "", "clean_up"], [142, 2, 1, "", "stream"]], "cmoncrawl.processor.pipeline.streamer.StreamerFileHTML": [[144, 2, 1, "", "__init__"], [145, 2, 1, "", "clean_up"], [146, 2, 1, "", "get_file_name"], [147, 2, 1, "", "metadata_to_string"], [148, 2, 1, "", "stream"]], "cmoncrawl.processor.pipeline.streamer.StreamerFileJSON": [[150, 2, 1, "", "__init__"], [151, 2, 1, "", "clean_up"], [152, 2, 1, "", "get_file_name"], [153, 2, 1, "", "metadata_to_string"], [154, 2, 1, "", "stream"]]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:method"}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "class", "Python class"], "2": ["py", "method", "Python method"]}, "titleterms": {"api": 0, "command": [1, 2, 3, 4], "line": [1, 2, 3, 4], "interfac": [1, 4], "exampl": [1, 2, 3, 5, 6], "download": [2, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 159], "posit": [2, 3], "argument": [2, 3], "option": [2, 3], "record": [2, 3, 156], "mode": 2, "extract": [3, 6, 7, 8, 64, 65, 66, 87, 94, 101, 108, 159], "html": 3, "content": [4, 7, 155, 157, 158], "extractor": [5, 6, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 155, 159], "config": 5, "file": [5, 159], "structur": 5, "__init__": [5, 13, 23, 30, 32, 39, 46, 53, 55, 57, 70, 76, 82, 86, 93, 100, 107, 111, 115, 118, 120, 130, 136, 140, 144, 150], "py": 5, "arbitrari": 5, "code": 5, "execut": 5, "custom": [6, 160], "baseextractor": [6, 85, 86, 87, 88, 89, 90, 91], "filter": [6, 8, 65, 159], "util": [8, 19, 20, 21, 22, 23, 24, 25, 66], "cmoncrawl": [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 
79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154], "aggreg": [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25], "index_queri": [11, 12, 13, 14, 15, 16, 17, 18], "indexaggreg": [12, 13, 14, 15, 16, 17, 18], "aclos": [14, 71], "aopen": [15, 72], "get_all_cc_index": 16, "get_captured_respons": 17, "get_number_of_pag": 18, "helper": 20, "ndjson_decod": [21, 22, 23, 24, 25], "decod": [22, 23, 24, 25], "raw_decod": 25, "common": [26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 159], "logger": 27, "type": [28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62], "domaincrawl": [29, 30], "domainrecord": [31, 32, 33, 34, 35, 36, 37], "from_dict": [33, 40, 47, 58], "from_json": [34, 41, 48, 59], "schema": [35, 42, 49, 60], "to_dict": [36, 43, 50, 61], "to_json": [37, 44, 51, 62], "extractconfig": [38, 39, 40, 41, 42, 43, 44], "extractorconfig": [45, 46, 47, 48, 49, 50, 51], "pipemetadata": [52, 53], "retrieverespons": [54, 55], "routesconfig": [56, 57, 58, 59, 60, 61, 62], "processor": [63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154], "pipelin": [67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 
83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 160], "asyncdownload": [69, 70, 71, 72, 73, 74], "unwrap": 74, "downloaderdummi": [75, 76, 77, 78, 79, 80], "extract_url": 78, "extract_year": 79, "mine_metadata": 80, "idownload": [81, 82, 83], "extract_soup": [88, 95, 102], "filter_raw": [89, 96, 103], "filter_soup": [90, 97, 104], "preprocess": [91, 98, 105], "domainrecordextractor": [92, 93, 94, 95, 96, 97, 98], "htmlextractor": [99, 100, 101, 102, 103, 104, 105], "iextractor": [106, 107, 108], "processorpipelin": [110, 111, 112], "process_domain_record": 112, "router": [113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127], "irout": [114, 115, 116], "rout": [116, 117, 118, 127], "load_extractor": 121, "load_modul": [122, 124], "load_module_as_extractor": 123, "register_rout": [125, 126], "streamer": [128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154], "basestreamerfil": [129, 130, 131, 132, 133, 134], "clean_up": [131, 137, 141, 145, 151], "get_file_nam": [132, 146, 152], "metadata_to_str": [133, 147, 153], "stream": [134, 138, 142, 148, 154], "istream": [135, 136, 137, 138], "streamerdummi": [139, 140, 141, 142], "streamerfilehtml": [143, 144, 145, 146, 147, 148], "streamerfilejson": [149, 150, 151, 152, 153, 154], "welcom": 155, "commoncrawl": [155, 159], "": 155, "document": 155, "indic": 155, "tabl": 155, "domain": 156, "jsonl": 156, "format": 156, "miscellan": 157, "program": [158, 159], "guid": [158, 159], "how": 159, "from": 159, "crawl": 159, "theori": 159, "1": 159, "queri": 159, "2": 159, "3": 159, "choos": 159, "4": 159, "out": 159, "web": 159, "page": 159, 
"5": 159, "field": 159, "6": 159, "save": 159, "put": 160, "all": 160, "togeth": 160, "usag": 161, "workflow": 161}, "envversion": {"sphinx.domains.c": 2, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 8, "sphinx.domains.index": 1, "sphinx.domains.javascript": 2, "sphinx.domains.math": 2, "sphinx.domains.python": 3, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "nbsphinx": 4, "sphinx": 57}, "alltitles": {"API": [[0, "api"]], "Command Line Interface": [[1, "command-line-interface"], [4, "command-line-interface"]], "Examples": [[1, "examples"], [2, "examples"], [3, "examples"]], "Command Line Download": [[2, "command-line-download"]], "Positional arguments": [[2, "positional-arguments"], [3, "positional-arguments"]], "Options": [[2, "options"]], "Record mode options": [[2, "record-mode-options"]], "Command line Extract": [[3, "command-line-extract"]], "Optional arguments": [[3, "optional-arguments"]], "Record arguments": [[3, "record-arguments"]], "Html arguments": [[3, "html-arguments"]], "Contents:": [[4, null], [7, null], [155, null], [157, null], [158, null]], "Extractor config file": [[5, "extractor-config-file"]], "Structure": [[5, "structure"]], "Example": [[5, "example"], [6, "example"]], "__init__.py": [[5, "init-py"]], "Arbitrary Code Execution": [[5, "arbitrary-code-execution"]], "Custom Extractor": [[6, "custom-extractor"]], "BaseExtractor": [[6, "baseextractor"]], "Extraction": [[6, "extraction"], [7, "extraction"], [8, "extraction"]], "Filtering": [[6, "filtering"], [8, "filtering"]], "Extraction utils": [[8, "extraction-utils"]], "cmoncrawl": [[9, "module-cmoncrawl"]], "cmoncrawl.aggregator": [[10, "module-cmoncrawl.aggregator"]], "cmoncrawl.aggregator.index_query": [[11, "module-cmoncrawl.aggregator.index_query"]], "cmoncrawl.aggregator.index_query.IndexAggregator": [[12, "cmoncrawl-aggregator-index-query-indexaggregator"]], "cmoncrawl.aggregator.index_query.IndexAggregator.__init__": [[13, 
"cmoncrawl-aggregator-index-query-indexaggregator-init"]], "cmoncrawl.aggregator.index_query.IndexAggregator.aclose": [[14, "cmoncrawl-aggregator-index-query-indexaggregator-aclose"]], "cmoncrawl.aggregator.index_query.IndexAggregator.aopen": [[15, "cmoncrawl-aggregator-index-query-indexaggregator-aopen"]], "cmoncrawl.aggregator.index_query.IndexAggregator.get_all_CC_indexes": [[16, "cmoncrawl-aggregator-index-query-indexaggregator-get-all-cc-indexes"]], "cmoncrawl.aggregator.index_query.IndexAggregator.get_captured_responses": [[17, "cmoncrawl-aggregator-index-query-indexaggregator-get-captured-responses"]], "cmoncrawl.aggregator.index_query.IndexAggregator.get_number_of_pages": [[18, "cmoncrawl-aggregator-index-query-indexaggregator-get-number-of-pages"]], "cmoncrawl.aggregator.utils": [[19, "module-cmoncrawl.aggregator.utils"]], "cmoncrawl.aggregator.utils.helpers": [[20, "module-cmoncrawl.aggregator.utils.helpers"]], "cmoncrawl.aggregator.utils.ndjson_decoder": [[21, "module-cmoncrawl.aggregator.utils.ndjson_decoder"]], "cmoncrawl.aggregator.utils.ndjson_decoder.Decoder": [[22, "cmoncrawl-aggregator-utils-ndjson-decoder-decoder"]], "cmoncrawl.aggregator.utils.ndjson_decoder.Decoder.__init__": [[23, "cmoncrawl-aggregator-utils-ndjson-decoder-decoder-init"]], "cmoncrawl.aggregator.utils.ndjson_decoder.Decoder.decode": [[24, "cmoncrawl-aggregator-utils-ndjson-decoder-decoder-decode"]], "cmoncrawl.aggregator.utils.ndjson_decoder.Decoder.raw_decode": [[25, "cmoncrawl-aggregator-utils-ndjson-decoder-decoder-raw-decode"]], "cmoncrawl.common": [[26, "module-cmoncrawl.common"]], "cmoncrawl.common.loggers": [[27, "module-cmoncrawl.common.loggers"]], "cmoncrawl.common.types": [[28, "module-cmoncrawl.common.types"]], "cmoncrawl.common.types.DomainCrawl": [[29, "cmoncrawl-common-types-domaincrawl"]], "cmoncrawl.common.types.DomainCrawl.__init__": [[30, "cmoncrawl-common-types-domaincrawl-init"]], "cmoncrawl.common.types.DomainRecord": [[31, 
"cmoncrawl-common-types-domainrecord"]], "cmoncrawl.common.types.DomainRecord.__init__": [[32, "cmoncrawl-common-types-domainrecord-init"]], "cmoncrawl.common.types.DomainRecord.from_dict": [[33, "cmoncrawl-common-types-domainrecord-from-dict"]], "cmoncrawl.common.types.DomainRecord.from_json": [[34, "cmoncrawl-common-types-domainrecord-from-json"]], "cmoncrawl.common.types.DomainRecord.schema": [[35, "cmoncrawl-common-types-domainrecord-schema"]], "cmoncrawl.common.types.DomainRecord.to_dict": [[36, "cmoncrawl-common-types-domainrecord-to-dict"]], "cmoncrawl.common.types.DomainRecord.to_json": [[37, "cmoncrawl-common-types-domainrecord-to-json"]], "cmoncrawl.common.types.ExtractConfig": [[38, "cmoncrawl-common-types-extractconfig"]], "cmoncrawl.common.types.ExtractConfig.__init__": [[39, "cmoncrawl-common-types-extractconfig-init"]], "cmoncrawl.common.types.ExtractConfig.from_dict": [[40, "cmoncrawl-common-types-extractconfig-from-dict"]], "cmoncrawl.common.types.ExtractConfig.from_json": [[41, "cmoncrawl-common-types-extractconfig-from-json"]], "cmoncrawl.common.types.ExtractConfig.schema": [[42, "cmoncrawl-common-types-extractconfig-schema"]], "cmoncrawl.common.types.ExtractConfig.to_dict": [[43, "cmoncrawl-common-types-extractconfig-to-dict"]], "cmoncrawl.common.types.ExtractConfig.to_json": [[44, "cmoncrawl-common-types-extractconfig-to-json"]], "cmoncrawl.common.types.ExtractorConfig": [[45, "cmoncrawl-common-types-extractorconfig"]], "cmoncrawl.common.types.ExtractorConfig.__init__": [[46, "cmoncrawl-common-types-extractorconfig-init"]], "cmoncrawl.common.types.ExtractorConfig.from_dict": [[47, "cmoncrawl-common-types-extractorconfig-from-dict"]], "cmoncrawl.common.types.ExtractorConfig.from_json": [[48, "cmoncrawl-common-types-extractorconfig-from-json"]], "cmoncrawl.common.types.ExtractorConfig.schema": [[49, "cmoncrawl-common-types-extractorconfig-schema"]], "cmoncrawl.common.types.ExtractorConfig.to_dict": [[50, 
"cmoncrawl-common-types-extractorconfig-to-dict"]], "cmoncrawl.common.types.ExtractorConfig.to_json": [[51, "cmoncrawl-common-types-extractorconfig-to-json"]], "cmoncrawl.common.types.PipeMetadata": [[52, "cmoncrawl-common-types-pipemetadata"]], "cmoncrawl.common.types.PipeMetadata.__init__": [[53, "cmoncrawl-common-types-pipemetadata-init"]], "cmoncrawl.common.types.RetrieveResponse": [[54, "cmoncrawl-common-types-retrieveresponse"]], "cmoncrawl.common.types.RetrieveResponse.__init__": [[55, "cmoncrawl-common-types-retrieveresponse-init"]], "cmoncrawl.common.types.RoutesConfig": [[56, "cmoncrawl-common-types-routesconfig"]], "cmoncrawl.common.types.RoutesConfig.__init__": [[57, "cmoncrawl-common-types-routesconfig-init"]], "cmoncrawl.common.types.RoutesConfig.from_dict": [[58, "cmoncrawl-common-types-routesconfig-from-dict"]], "cmoncrawl.common.types.RoutesConfig.from_json": [[59, "cmoncrawl-common-types-routesconfig-from-json"]], "cmoncrawl.common.types.RoutesConfig.schema": [[60, "cmoncrawl-common-types-routesconfig-schema"]], "cmoncrawl.common.types.RoutesConfig.to_dict": [[61, "cmoncrawl-common-types-routesconfig-to-dict"]], "cmoncrawl.common.types.RoutesConfig.to_json": [[62, "cmoncrawl-common-types-routesconfig-to-json"]], "cmoncrawl.processor": [[63, "module-cmoncrawl.processor"]], "cmoncrawl.processor.extraction": [[64, "module-cmoncrawl.processor.extraction"]], "cmoncrawl.processor.extraction.filters": [[65, "module-cmoncrawl.processor.extraction.filters"]], "cmoncrawl.processor.extraction.utils": [[66, "module-cmoncrawl.processor.extraction.utils"]], "cmoncrawl.processor.pipeline": [[67, "module-cmoncrawl.processor.pipeline"]], "cmoncrawl.processor.pipeline.downloader": [[68, "module-cmoncrawl.processor.pipeline.downloader"]], "cmoncrawl.processor.pipeline.downloader.AsyncDownloader": [[69, "cmoncrawl-processor-pipeline-downloader-asyncdownloader"]], "cmoncrawl.processor.pipeline.downloader.AsyncDownloader.__init__": [[70, 
"cmoncrawl-processor-pipeline-downloader-asyncdownloader-init"]], "cmoncrawl.processor.pipeline.downloader.AsyncDownloader.aclose": [[71, "cmoncrawl-processor-pipeline-downloader-asyncdownloader-aclose"]], "cmoncrawl.processor.pipeline.downloader.AsyncDownloader.aopen": [[72, "cmoncrawl-processor-pipeline-downloader-asyncdownloader-aopen"]], "cmoncrawl.processor.pipeline.downloader.AsyncDownloader.download": [[73, "cmoncrawl-processor-pipeline-downloader-asyncdownloader-download"]], "cmoncrawl.processor.pipeline.downloader.AsyncDownloader.unwrap": [[74, "cmoncrawl-processor-pipeline-downloader-asyncdownloader-unwrap"]], "cmoncrawl.processor.pipeline.downloader.DownloaderDummy": [[75, "cmoncrawl-processor-pipeline-downloader-downloaderdummy"]], "cmoncrawl.processor.pipeline.downloader.DownloaderDummy.__init__": [[76, "cmoncrawl-processor-pipeline-downloader-downloaderdummy-init"]], "cmoncrawl.processor.pipeline.downloader.DownloaderDummy.download": [[77, "cmoncrawl-processor-pipeline-downloader-downloaderdummy-download"]], "cmoncrawl.processor.pipeline.downloader.DownloaderDummy.extract_url": [[78, "cmoncrawl-processor-pipeline-downloader-downloaderdummy-extract-url"]], "cmoncrawl.processor.pipeline.downloader.DownloaderDummy.extract_year": [[79, "cmoncrawl-processor-pipeline-downloader-downloaderdummy-extract-year"]], "cmoncrawl.processor.pipeline.downloader.DownloaderDummy.mine_metadata": [[80, "cmoncrawl-processor-pipeline-downloader-downloaderdummy-mine-metadata"]], "cmoncrawl.processor.pipeline.downloader.IDownloader": [[81, "cmoncrawl-processor-pipeline-downloader-idownloader"]], "cmoncrawl.processor.pipeline.downloader.IDownloader.__init__": [[82, "cmoncrawl-processor-pipeline-downloader-idownloader-init"]], "cmoncrawl.processor.pipeline.downloader.IDownloader.download": [[83, "cmoncrawl-processor-pipeline-downloader-idownloader-download"]], "cmoncrawl.processor.pipeline.extractor": [[84, "module-cmoncrawl.processor.pipeline.extractor"]], 
"cmoncrawl.processor.pipeline.extractor.BaseExtractor": [[85, "cmoncrawl-processor-pipeline-extractor-baseextractor"]], "cmoncrawl.processor.pipeline.extractor.BaseExtractor.__init__": [[86, "cmoncrawl-processor-pipeline-extractor-baseextractor-init"]], "cmoncrawl.processor.pipeline.extractor.BaseExtractor.extract": [[87, "cmoncrawl-processor-pipeline-extractor-baseextractor-extract"]], "cmoncrawl.processor.pipeline.extractor.BaseExtractor.extract_soup": [[88, "cmoncrawl-processor-pipeline-extractor-baseextractor-extract-soup"]], "cmoncrawl.processor.pipeline.extractor.BaseExtractor.filter_raw": [[89, "cmoncrawl-processor-pipeline-extractor-baseextractor-filter-raw"]], "cmoncrawl.processor.pipeline.extractor.BaseExtractor.filter_soup": [[90, "cmoncrawl-processor-pipeline-extractor-baseextractor-filter-soup"]], "cmoncrawl.processor.pipeline.extractor.BaseExtractor.preprocess": [[91, "cmoncrawl-processor-pipeline-extractor-baseextractor-preprocess"]], "cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor": [[92, "cmoncrawl-processor-pipeline-extractor-domainrecordextractor"]], "cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.__init__": [[93, "cmoncrawl-processor-pipeline-extractor-domainrecordextractor-init"]], "cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.extract": [[94, "cmoncrawl-processor-pipeline-extractor-domainrecordextractor-extract"]], "cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.extract_soup": [[95, "cmoncrawl-processor-pipeline-extractor-domainrecordextractor-extract-soup"]], "cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.filter_raw": [[96, "cmoncrawl-processor-pipeline-extractor-domainrecordextractor-filter-raw"]], "cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.filter_soup": [[97, "cmoncrawl-processor-pipeline-extractor-domainrecordextractor-filter-soup"]], "cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.preprocess": [[98, 
"cmoncrawl-processor-pipeline-extractor-domainrecordextractor-preprocess"]], "cmoncrawl.processor.pipeline.extractor.HTMLExtractor": [[99, "cmoncrawl-processor-pipeline-extractor-htmlextractor"]], "cmoncrawl.processor.pipeline.extractor.HTMLExtractor.__init__": [[100, "cmoncrawl-processor-pipeline-extractor-htmlextractor-init"]], "cmoncrawl.processor.pipeline.extractor.HTMLExtractor.extract": [[101, "cmoncrawl-processor-pipeline-extractor-htmlextractor-extract"]], "cmoncrawl.processor.pipeline.extractor.HTMLExtractor.extract_soup": [[102, "cmoncrawl-processor-pipeline-extractor-htmlextractor-extract-soup"]], "cmoncrawl.processor.pipeline.extractor.HTMLExtractor.filter_raw": [[103, "cmoncrawl-processor-pipeline-extractor-htmlextractor-filter-raw"]], "cmoncrawl.processor.pipeline.extractor.HTMLExtractor.filter_soup": [[104, "cmoncrawl-processor-pipeline-extractor-htmlextractor-filter-soup"]], "cmoncrawl.processor.pipeline.extractor.HTMLExtractor.preprocess": [[105, "cmoncrawl-processor-pipeline-extractor-htmlextractor-preprocess"]], "cmoncrawl.processor.pipeline.extractor.IExtractor": [[106, "cmoncrawl-processor-pipeline-extractor-iextractor"]], "cmoncrawl.processor.pipeline.extractor.IExtractor.__init__": [[107, "cmoncrawl-processor-pipeline-extractor-iextractor-init"]], "cmoncrawl.processor.pipeline.extractor.IExtractor.extract": [[108, "cmoncrawl-processor-pipeline-extractor-iextractor-extract"]], "cmoncrawl.processor.pipeline.pipeline": [[109, "module-cmoncrawl.processor.pipeline.pipeline"]], "cmoncrawl.processor.pipeline.pipeline.ProcessorPipeline": [[110, "cmoncrawl-processor-pipeline-pipeline-processorpipeline"]], "cmoncrawl.processor.pipeline.pipeline.ProcessorPipeline.__init__": [[111, "cmoncrawl-processor-pipeline-pipeline-processorpipeline-init"]], "cmoncrawl.processor.pipeline.pipeline.ProcessorPipeline.process_domain_record": [[112, "cmoncrawl-processor-pipeline-pipeline-processorpipeline-process-domain-record"]], "cmoncrawl.processor.pipeline.router": 
[[113, "module-cmoncrawl.processor.pipeline.router"]], "cmoncrawl.processor.pipeline.router.IRouter": [[114, "cmoncrawl-processor-pipeline-router-irouter"]], "cmoncrawl.processor.pipeline.router.IRouter.__init__": [[115, "cmoncrawl-processor-pipeline-router-irouter-init"]], "cmoncrawl.processor.pipeline.router.IRouter.route": [[116, "cmoncrawl-processor-pipeline-router-irouter-route"]], "cmoncrawl.processor.pipeline.router.Route": [[117, "cmoncrawl-processor-pipeline-router-route"]], "cmoncrawl.processor.pipeline.router.Route.__init__": [[118, "cmoncrawl-processor-pipeline-router-route-init"]], "cmoncrawl.processor.pipeline.router.Router": [[119, "cmoncrawl-processor-pipeline-router-router"]], "cmoncrawl.processor.pipeline.router.Router.__init__": [[120, "cmoncrawl-processor-pipeline-router-router-init"]], "cmoncrawl.processor.pipeline.router.Router.load_extractor": [[121, "cmoncrawl-processor-pipeline-router-router-load-extractor"]], "cmoncrawl.processor.pipeline.router.Router.load_module": [[122, "cmoncrawl-processor-pipeline-router-router-load-module"]], "cmoncrawl.processor.pipeline.router.Router.load_module_as_extractor": [[123, "cmoncrawl-processor-pipeline-router-router-load-module-as-extractor"]], "cmoncrawl.processor.pipeline.router.Router.load_modules": [[124, "cmoncrawl-processor-pipeline-router-router-load-modules"]], "cmoncrawl.processor.pipeline.router.Router.register_route": [[125, "cmoncrawl-processor-pipeline-router-router-register-route"]], "cmoncrawl.processor.pipeline.router.Router.register_routes": [[126, "cmoncrawl-processor-pipeline-router-router-register-routes"]], "cmoncrawl.processor.pipeline.router.Router.route": [[127, "cmoncrawl-processor-pipeline-router-router-route"]], "cmoncrawl.processor.pipeline.streamer": [[128, "module-cmoncrawl.processor.pipeline.streamer"]], "cmoncrawl.processor.pipeline.streamer.BaseStreamerFile": [[129, "cmoncrawl-processor-pipeline-streamer-basestreamerfile"]], 
"cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.__init__": [[130, "cmoncrawl-processor-pipeline-streamer-basestreamerfile-init"]], "cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.clean_up": [[131, "cmoncrawl-processor-pipeline-streamer-basestreamerfile-clean-up"]], "cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.get_file_name": [[132, "cmoncrawl-processor-pipeline-streamer-basestreamerfile-get-file-name"]], "cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.metadata_to_string": [[133, "cmoncrawl-processor-pipeline-streamer-basestreamerfile-metadata-to-string"]], "cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.stream": [[134, "cmoncrawl-processor-pipeline-streamer-basestreamerfile-stream"]], "cmoncrawl.processor.pipeline.streamer.IStreamer": [[135, "cmoncrawl-processor-pipeline-streamer-istreamer"]], "cmoncrawl.processor.pipeline.streamer.IStreamer.__init__": [[136, "cmoncrawl-processor-pipeline-streamer-istreamer-init"]], "cmoncrawl.processor.pipeline.streamer.IStreamer.clean_up": [[137, "cmoncrawl-processor-pipeline-streamer-istreamer-clean-up"]], "cmoncrawl.processor.pipeline.streamer.IStreamer.stream": [[138, "cmoncrawl-processor-pipeline-streamer-istreamer-stream"]], "cmoncrawl.processor.pipeline.streamer.StreamerDummy": [[139, "cmoncrawl-processor-pipeline-streamer-streamerdummy"]], "cmoncrawl.processor.pipeline.streamer.StreamerDummy.__init__": [[140, "cmoncrawl-processor-pipeline-streamer-streamerdummy-init"]], "cmoncrawl.processor.pipeline.streamer.StreamerDummy.clean_up": [[141, "cmoncrawl-processor-pipeline-streamer-streamerdummy-clean-up"]], "cmoncrawl.processor.pipeline.streamer.StreamerDummy.stream": [[142, "cmoncrawl-processor-pipeline-streamer-streamerdummy-stream"]], "cmoncrawl.processor.pipeline.streamer.StreamerFileHTML": [[143, "cmoncrawl-processor-pipeline-streamer-streamerfilehtml"]], "cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.__init__": [[144, 
"cmoncrawl-processor-pipeline-streamer-streamerfilehtml-init"]], "cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.clean_up": [[145, "cmoncrawl-processor-pipeline-streamer-streamerfilehtml-clean-up"]], "cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.get_file_name": [[146, "cmoncrawl-processor-pipeline-streamer-streamerfilehtml-get-file-name"]], "cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.metadata_to_string": [[147, "cmoncrawl-processor-pipeline-streamer-streamerfilehtml-metadata-to-string"]], "cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.stream": [[148, "cmoncrawl-processor-pipeline-streamer-streamerfilehtml-stream"]], "cmoncrawl.processor.pipeline.streamer.StreamerFileJSON": [[149, "cmoncrawl-processor-pipeline-streamer-streamerfilejson"]], "cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.__init__": [[150, "cmoncrawl-processor-pipeline-streamer-streamerfilejson-init"]], "cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.clean_up": [[151, "cmoncrawl-processor-pipeline-streamer-streamerfilejson-clean-up"]], "cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.get_file_name": [[152, "cmoncrawl-processor-pipeline-streamer-streamerfilejson-get-file-name"]], "cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.metadata_to_string": [[153, "cmoncrawl-processor-pipeline-streamer-streamerfilejson-metadata-to-string"]], "cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.stream": [[154, "cmoncrawl-processor-pipeline-streamer-streamerfilejson-stream"]], "Welcome to CommonCrawl Extractor\u2019s documentation!": [[155, "welcome-to-commoncrawl-extractor-s-documentation"]], "Indices and tables": [[155, "indices-and-tables"]], "Domain Record": [[156, "domain-record"]], "Domain Record JSONL format": [[156, "domain-record-jsonl-format"]], "Miscellaneous": [[157, "miscellaneous"]], "Programming Guide": [[158, "programming-guide"], [159, "programming-guide"]], "How to extract from Common Crawl (theory)": [[159, 
"how-to-extract-from-common-crawl-theory"]], "1. Querying CommonCrawl": [[159, "querying-commoncrawl"]], "2. Downloading a file": [[159, "downloading-a-file"]], "3. Choose extractor": [[159, "choose-extractor"]], "4. Filtering out the web page": [[159, "filtering-out-the-web-page"]], "5. Extract fields from the page": [[159, "extract-fields-from-the-page"]], "6. File saving": [[159, "file-saving"]], "Custom Pipeline": [[160, "custom-pipeline"]], "Pipeline": [[160, "pipeline"]], "Putting it all together": [[160, "putting-it-all-together"]], "Usage": [[161, "usage"]], "Workflow": [[161, "workflow"]]}, "indexentries": {"cmoncrawl": [[9, "module-cmoncrawl"]], "module": [[9, "module-cmoncrawl"], [10, "module-cmoncrawl.aggregator"], [11, "module-cmoncrawl.aggregator.index_query"], [19, "module-cmoncrawl.aggregator.utils"], [20, "module-cmoncrawl.aggregator.utils.helpers"], [21, "module-cmoncrawl.aggregator.utils.ndjson_decoder"], [26, "module-cmoncrawl.common"], [27, "module-cmoncrawl.common.loggers"], [28, "module-cmoncrawl.common.types"], [63, "module-cmoncrawl.processor"], [64, "module-cmoncrawl.processor.extraction"], [65, "module-cmoncrawl.processor.extraction.filters"], [66, "module-cmoncrawl.processor.extraction.utils"], [67, "module-cmoncrawl.processor.pipeline"], [68, "module-cmoncrawl.processor.pipeline.downloader"], [84, "module-cmoncrawl.processor.pipeline.extractor"], [109, "module-cmoncrawl.processor.pipeline.pipeline"], [113, "module-cmoncrawl.processor.pipeline.router"], [128, "module-cmoncrawl.processor.pipeline.streamer"]], "cmoncrawl.aggregator": [[10, "module-cmoncrawl.aggregator"]], "cmoncrawl.aggregator.index_query": [[11, "module-cmoncrawl.aggregator.index_query"]], "indexaggregator (class in cmoncrawl.aggregator.index_query)": [[12, "cmoncrawl.aggregator.index_query.IndexAggregator"]], "__init__() (cmoncrawl.aggregator.index_query.indexaggregator method)": [[12, "cmoncrawl.aggregator.index_query.IndexAggregator.__init__"], [13, 
"cmoncrawl.aggregator.index_query.IndexAggregator.__init__"]], "aclose() (cmoncrawl.aggregator.index_query.indexaggregator method)": [[14, "cmoncrawl.aggregator.index_query.IndexAggregator.aclose"]], "aopen() (cmoncrawl.aggregator.index_query.indexaggregator method)": [[15, "cmoncrawl.aggregator.index_query.IndexAggregator.aopen"]], "get_all_cc_indexes() (cmoncrawl.aggregator.index_query.indexaggregator static method)": [[16, "cmoncrawl.aggregator.index_query.IndexAggregator.get_all_CC_indexes"]], "get_captured_responses() (cmoncrawl.aggregator.index_query.indexaggregator static method)": [[17, "cmoncrawl.aggregator.index_query.IndexAggregator.get_captured_responses"]], "get_number_of_pages() (cmoncrawl.aggregator.index_query.indexaggregator static method)": [[18, "cmoncrawl.aggregator.index_query.IndexAggregator.get_number_of_pages"]], "cmoncrawl.aggregator.utils": [[19, "module-cmoncrawl.aggregator.utils"]], "cmoncrawl.aggregator.utils.helpers": [[20, "module-cmoncrawl.aggregator.utils.helpers"]], "cmoncrawl.aggregator.utils.ndjson_decoder": [[21, "module-cmoncrawl.aggregator.utils.ndjson_decoder"]], "decoder (class in cmoncrawl.aggregator.utils.ndjson_decoder)": [[22, "cmoncrawl.aggregator.utils.ndjson_decoder.Decoder"]], "__init__() (cmoncrawl.aggregator.utils.ndjson_decoder.decoder method)": [[22, "cmoncrawl.aggregator.utils.ndjson_decoder.Decoder.__init__"], [23, "cmoncrawl.aggregator.utils.ndjson_decoder.Decoder.__init__"]], "decode() (cmoncrawl.aggregator.utils.ndjson_decoder.decoder method)": [[24, "cmoncrawl.aggregator.utils.ndjson_decoder.Decoder.decode"]], "raw_decode() (cmoncrawl.aggregator.utils.ndjson_decoder.decoder method)": [[25, "cmoncrawl.aggregator.utils.ndjson_decoder.Decoder.raw_decode"]], "cmoncrawl.common": [[26, "module-cmoncrawl.common"]], "cmoncrawl.common.loggers": [[27, "module-cmoncrawl.common.loggers"]], "cmoncrawl.common.types": [[28, "module-cmoncrawl.common.types"]], "domaincrawl (class in cmoncrawl.common.types)": [[29, 
"cmoncrawl.common.types.DomainCrawl"]], "__init__() (cmoncrawl.common.types.domaincrawl method)": [[29, "cmoncrawl.common.types.DomainCrawl.__init__"], [30, "cmoncrawl.common.types.DomainCrawl.__init__"]], "domainrecord (class in cmoncrawl.common.types)": [[31, "cmoncrawl.common.types.DomainRecord"]], "__init__() (cmoncrawl.common.types.domainrecord method)": [[31, "cmoncrawl.common.types.DomainRecord.__init__"], [32, "cmoncrawl.common.types.DomainRecord.__init__"]], "from_dict() (cmoncrawl.common.types.domainrecord class method)": [[33, "cmoncrawl.common.types.DomainRecord.from_dict"]], "from_json() (cmoncrawl.common.types.domainrecord class method)": [[34, "cmoncrawl.common.types.DomainRecord.from_json"]], "schema() (cmoncrawl.common.types.domainrecord class method)": [[35, "cmoncrawl.common.types.DomainRecord.schema"]], "to_dict() (cmoncrawl.common.types.domainrecord method)": [[36, "cmoncrawl.common.types.DomainRecord.to_dict"]], "to_json() (cmoncrawl.common.types.domainrecord method)": [[37, "cmoncrawl.common.types.DomainRecord.to_json"]], "extractconfig (class in cmoncrawl.common.types)": [[38, "cmoncrawl.common.types.ExtractConfig"]], "__init__() (cmoncrawl.common.types.extractconfig method)": [[38, "cmoncrawl.common.types.ExtractConfig.__init__"], [39, "cmoncrawl.common.types.ExtractConfig.__init__"]], "from_dict() (cmoncrawl.common.types.extractconfig class method)": [[40, "cmoncrawl.common.types.ExtractConfig.from_dict"]], "from_json() (cmoncrawl.common.types.extractconfig class method)": [[41, "cmoncrawl.common.types.ExtractConfig.from_json"]], "schema() (cmoncrawl.common.types.extractconfig class method)": [[42, "cmoncrawl.common.types.ExtractConfig.schema"]], "to_dict() (cmoncrawl.common.types.extractconfig method)": [[43, "cmoncrawl.common.types.ExtractConfig.to_dict"]], "to_json() (cmoncrawl.common.types.extractconfig method)": [[44, "cmoncrawl.common.types.ExtractConfig.to_json"]], "extractorconfig (class in cmoncrawl.common.types)": [[45, 
"cmoncrawl.common.types.ExtractorConfig"]], "__init__() (cmoncrawl.common.types.extractorconfig method)": [[45, "cmoncrawl.common.types.ExtractorConfig.__init__"], [46, "cmoncrawl.common.types.ExtractorConfig.__init__"]], "from_dict() (cmoncrawl.common.types.extractorconfig class method)": [[47, "cmoncrawl.common.types.ExtractorConfig.from_dict"]], "from_json() (cmoncrawl.common.types.extractorconfig class method)": [[48, "cmoncrawl.common.types.ExtractorConfig.from_json"]], "schema() (cmoncrawl.common.types.extractorconfig class method)": [[49, "cmoncrawl.common.types.ExtractorConfig.schema"]], "to_dict() (cmoncrawl.common.types.extractorconfig method)": [[50, "cmoncrawl.common.types.ExtractorConfig.to_dict"]], "to_json() (cmoncrawl.common.types.extractorconfig method)": [[51, "cmoncrawl.common.types.ExtractorConfig.to_json"]], "pipemetadata (class in cmoncrawl.common.types)": [[52, "cmoncrawl.common.types.PipeMetadata"]], "__init__() (cmoncrawl.common.types.pipemetadata method)": [[52, "cmoncrawl.common.types.PipeMetadata.__init__"], [53, "cmoncrawl.common.types.PipeMetadata.__init__"]], "retrieveresponse (class in cmoncrawl.common.types)": [[54, "cmoncrawl.common.types.RetrieveResponse"]], "__init__() (cmoncrawl.common.types.retrieveresponse method)": [[54, "cmoncrawl.common.types.RetrieveResponse.__init__"], [55, "cmoncrawl.common.types.RetrieveResponse.__init__"]], "routesconfig (class in cmoncrawl.common.types)": [[56, "cmoncrawl.common.types.RoutesConfig"]], "__init__() (cmoncrawl.common.types.routesconfig method)": [[56, "cmoncrawl.common.types.RoutesConfig.__init__"], [57, "cmoncrawl.common.types.RoutesConfig.__init__"]], "from_dict() (cmoncrawl.common.types.routesconfig class method)": [[58, "cmoncrawl.common.types.RoutesConfig.from_dict"]], "from_json() (cmoncrawl.common.types.routesconfig class method)": [[59, "cmoncrawl.common.types.RoutesConfig.from_json"]], "schema() (cmoncrawl.common.types.routesconfig class method)": [[60, 
"cmoncrawl.common.types.RoutesConfig.schema"]], "to_dict() (cmoncrawl.common.types.routesconfig method)": [[61, "cmoncrawl.common.types.RoutesConfig.to_dict"]], "to_json() (cmoncrawl.common.types.routesconfig method)": [[62, "cmoncrawl.common.types.RoutesConfig.to_json"]], "cmoncrawl.processor": [[63, "module-cmoncrawl.processor"]], "cmoncrawl.processor.extraction": [[64, "module-cmoncrawl.processor.extraction"]], "cmoncrawl.processor.extraction.filters": [[65, "module-cmoncrawl.processor.extraction.filters"]], "cmoncrawl.processor.extraction.utils": [[66, "module-cmoncrawl.processor.extraction.utils"]], "cmoncrawl.processor.pipeline": [[67, "module-cmoncrawl.processor.pipeline"]], "cmoncrawl.processor.pipeline.downloader": [[68, "module-cmoncrawl.processor.pipeline.downloader"]], "asyncdownloader (class in cmoncrawl.processor.pipeline.downloader)": [[69, "cmoncrawl.processor.pipeline.downloader.AsyncDownloader"]], "__init__() (cmoncrawl.processor.pipeline.downloader.asyncdownloader method)": [[69, "cmoncrawl.processor.pipeline.downloader.AsyncDownloader.__init__"], [70, "cmoncrawl.processor.pipeline.downloader.AsyncDownloader.__init__"]], "aclose() (cmoncrawl.processor.pipeline.downloader.asyncdownloader method)": [[71, "cmoncrawl.processor.pipeline.downloader.AsyncDownloader.aclose"]], "aopen() (cmoncrawl.processor.pipeline.downloader.asyncdownloader method)": [[72, "cmoncrawl.processor.pipeline.downloader.AsyncDownloader.aopen"]], "download() (cmoncrawl.processor.pipeline.downloader.asyncdownloader method)": [[73, "cmoncrawl.processor.pipeline.downloader.AsyncDownloader.download"]], "unwrap() (cmoncrawl.processor.pipeline.downloader.asyncdownloader method)": [[74, "cmoncrawl.processor.pipeline.downloader.AsyncDownloader.unwrap"]], "downloaderdummy (class in cmoncrawl.processor.pipeline.downloader)": [[75, "cmoncrawl.processor.pipeline.downloader.DownloaderDummy"]], "__init__() (cmoncrawl.processor.pipeline.downloader.downloaderdummy method)": [[75, 
"cmoncrawl.processor.pipeline.downloader.DownloaderDummy.__init__"], [76, "cmoncrawl.processor.pipeline.downloader.DownloaderDummy.__init__"]], "download() (cmoncrawl.processor.pipeline.downloader.downloaderdummy method)": [[77, "cmoncrawl.processor.pipeline.downloader.DownloaderDummy.download"]], "extract_url() (cmoncrawl.processor.pipeline.downloader.downloaderdummy method)": [[78, "cmoncrawl.processor.pipeline.downloader.DownloaderDummy.extract_url"]], "extract_year() (cmoncrawl.processor.pipeline.downloader.downloaderdummy method)": [[79, "cmoncrawl.processor.pipeline.downloader.DownloaderDummy.extract_year"]], "mine_metadata() (cmoncrawl.processor.pipeline.downloader.downloaderdummy method)": [[80, "cmoncrawl.processor.pipeline.downloader.DownloaderDummy.mine_metadata"]], "idownloader (class in cmoncrawl.processor.pipeline.downloader)": [[81, "cmoncrawl.processor.pipeline.downloader.IDownloader"]], "__init__() (cmoncrawl.processor.pipeline.downloader.idownloader method)": [[81, "cmoncrawl.processor.pipeline.downloader.IDownloader.__init__"], [82, "cmoncrawl.processor.pipeline.downloader.IDownloader.__init__"]], "download() (cmoncrawl.processor.pipeline.downloader.idownloader method)": [[83, "cmoncrawl.processor.pipeline.downloader.IDownloader.download"]], "cmoncrawl.processor.pipeline.extractor": [[84, "module-cmoncrawl.processor.pipeline.extractor"]], "baseextractor (class in cmoncrawl.processor.pipeline.extractor)": [[85, "cmoncrawl.processor.pipeline.extractor.BaseExtractor"]], "__init__() (cmoncrawl.processor.pipeline.extractor.baseextractor method)": [[85, "cmoncrawl.processor.pipeline.extractor.BaseExtractor.__init__"], [86, "cmoncrawl.processor.pipeline.extractor.BaseExtractor.__init__"]], "extract() (cmoncrawl.processor.pipeline.extractor.baseextractor method)": [[87, "cmoncrawl.processor.pipeline.extractor.BaseExtractor.extract"]], "extract_soup() (cmoncrawl.processor.pipeline.extractor.baseextractor method)": [[88, 
"cmoncrawl.processor.pipeline.extractor.BaseExtractor.extract_soup"]], "filter_raw() (cmoncrawl.processor.pipeline.extractor.baseextractor method)": [[89, "cmoncrawl.processor.pipeline.extractor.BaseExtractor.filter_raw"]], "filter_soup() (cmoncrawl.processor.pipeline.extractor.baseextractor method)": [[90, "cmoncrawl.processor.pipeline.extractor.BaseExtractor.filter_soup"]], "preprocess() (cmoncrawl.processor.pipeline.extractor.baseextractor method)": [[91, "cmoncrawl.processor.pipeline.extractor.BaseExtractor.preprocess"]], "domainrecordextractor (class in cmoncrawl.processor.pipeline.extractor)": [[92, "cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor"]], "__init__() (cmoncrawl.processor.pipeline.extractor.domainrecordextractor method)": [[92, "cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.__init__"], [93, "cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.__init__"]], "extract() (cmoncrawl.processor.pipeline.extractor.domainrecordextractor method)": [[94, "cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.extract"]], "extract_soup() (cmoncrawl.processor.pipeline.extractor.domainrecordextractor method)": [[95, "cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.extract_soup"]], "filter_raw() (cmoncrawl.processor.pipeline.extractor.domainrecordextractor method)": [[96, "cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.filter_raw"]], "filter_soup() (cmoncrawl.processor.pipeline.extractor.domainrecordextractor method)": [[97, "cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.filter_soup"]], "preprocess() (cmoncrawl.processor.pipeline.extractor.domainrecordextractor method)": [[98, "cmoncrawl.processor.pipeline.extractor.DomainRecordExtractor.preprocess"]], "htmlextractor (class in cmoncrawl.processor.pipeline.extractor)": [[99, "cmoncrawl.processor.pipeline.extractor.HTMLExtractor"]], "__init__() (cmoncrawl.processor.pipeline.extractor.htmlextractor method)": [[99, 
"cmoncrawl.processor.pipeline.extractor.HTMLExtractor.__init__"], [100, "cmoncrawl.processor.pipeline.extractor.HTMLExtractor.__init__"]], "extract() (cmoncrawl.processor.pipeline.extractor.htmlextractor method)": [[101, "cmoncrawl.processor.pipeline.extractor.HTMLExtractor.extract"]], "extract_soup() (cmoncrawl.processor.pipeline.extractor.htmlextractor method)": [[102, "cmoncrawl.processor.pipeline.extractor.HTMLExtractor.extract_soup"]], "filter_raw() (cmoncrawl.processor.pipeline.extractor.htmlextractor method)": [[103, "cmoncrawl.processor.pipeline.extractor.HTMLExtractor.filter_raw"]], "filter_soup() (cmoncrawl.processor.pipeline.extractor.htmlextractor method)": [[104, "cmoncrawl.processor.pipeline.extractor.HTMLExtractor.filter_soup"]], "preprocess() (cmoncrawl.processor.pipeline.extractor.htmlextractor method)": [[105, "cmoncrawl.processor.pipeline.extractor.HTMLExtractor.preprocess"]], "iextractor (class in cmoncrawl.processor.pipeline.extractor)": [[106, "cmoncrawl.processor.pipeline.extractor.IExtractor"]], "__init__() (cmoncrawl.processor.pipeline.extractor.iextractor method)": [[106, "cmoncrawl.processor.pipeline.extractor.IExtractor.__init__"], [107, "cmoncrawl.processor.pipeline.extractor.IExtractor.__init__"]], "extract() (cmoncrawl.processor.pipeline.extractor.iextractor method)": [[108, "cmoncrawl.processor.pipeline.extractor.IExtractor.extract"]], "cmoncrawl.processor.pipeline.pipeline": [[109, "module-cmoncrawl.processor.pipeline.pipeline"]], "processorpipeline (class in cmoncrawl.processor.pipeline.pipeline)": [[110, "cmoncrawl.processor.pipeline.pipeline.ProcessorPipeline"]], "__init__() (cmoncrawl.processor.pipeline.pipeline.processorpipeline method)": [[110, "cmoncrawl.processor.pipeline.pipeline.ProcessorPipeline.__init__"], [111, "cmoncrawl.processor.pipeline.pipeline.ProcessorPipeline.__init__"]], "process_domain_record() (cmoncrawl.processor.pipeline.pipeline.processorpipeline method)": [[112, 
"cmoncrawl.processor.pipeline.pipeline.ProcessorPipeline.process_domain_record"]], "cmoncrawl.processor.pipeline.router": [[113, "module-cmoncrawl.processor.pipeline.router"]], "irouter (class in cmoncrawl.processor.pipeline.router)": [[114, "cmoncrawl.processor.pipeline.router.IRouter"]], "__init__() (cmoncrawl.processor.pipeline.router.irouter method)": [[114, "cmoncrawl.processor.pipeline.router.IRouter.__init__"], [115, "cmoncrawl.processor.pipeline.router.IRouter.__init__"]], "route() (cmoncrawl.processor.pipeline.router.irouter method)": [[116, "cmoncrawl.processor.pipeline.router.IRouter.route"]], "route (class in cmoncrawl.processor.pipeline.router)": [[117, "cmoncrawl.processor.pipeline.router.Route"]], "__init__() (cmoncrawl.processor.pipeline.router.route method)": [[117, "cmoncrawl.processor.pipeline.router.Route.__init__"], [118, "cmoncrawl.processor.pipeline.router.Route.__init__"]], "router (class in cmoncrawl.processor.pipeline.router)": [[119, "cmoncrawl.processor.pipeline.router.Router"]], "__init__() (cmoncrawl.processor.pipeline.router.router method)": [[119, "cmoncrawl.processor.pipeline.router.Router.__init__"], [120, "cmoncrawl.processor.pipeline.router.Router.__init__"]], "load_extractor() (cmoncrawl.processor.pipeline.router.router method)": [[121, "cmoncrawl.processor.pipeline.router.Router.load_extractor"]], "load_module() (cmoncrawl.processor.pipeline.router.router method)": [[122, "cmoncrawl.processor.pipeline.router.Router.load_module"]], "load_module_as_extractor() (cmoncrawl.processor.pipeline.router.router method)": [[123, "cmoncrawl.processor.pipeline.router.Router.load_module_as_extractor"]], "load_modules() (cmoncrawl.processor.pipeline.router.router method)": [[124, "cmoncrawl.processor.pipeline.router.Router.load_modules"]], "register_route() (cmoncrawl.processor.pipeline.router.router method)": [[125, "cmoncrawl.processor.pipeline.router.Router.register_route"]], "register_routes() (cmoncrawl.processor.pipeline.router.router 
method)": [[126, "cmoncrawl.processor.pipeline.router.Router.register_routes"]], "route() (cmoncrawl.processor.pipeline.router.router method)": [[127, "cmoncrawl.processor.pipeline.router.Router.route"]], "cmoncrawl.processor.pipeline.streamer": [[128, "module-cmoncrawl.processor.pipeline.streamer"]], "basestreamerfile (class in cmoncrawl.processor.pipeline.streamer)": [[129, "cmoncrawl.processor.pipeline.streamer.BaseStreamerFile"]], "__init__() (cmoncrawl.processor.pipeline.streamer.basestreamerfile method)": [[129, "cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.__init__"], [130, "cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.__init__"]], "clean_up() (cmoncrawl.processor.pipeline.streamer.basestreamerfile method)": [[131, "cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.clean_up"]], "get_file_name() (cmoncrawl.processor.pipeline.streamer.basestreamerfile method)": [[132, "cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.get_file_name"]], "metadata_to_string() (cmoncrawl.processor.pipeline.streamer.basestreamerfile method)": [[133, "cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.metadata_to_string"]], "stream() (cmoncrawl.processor.pipeline.streamer.basestreamerfile method)": [[134, "cmoncrawl.processor.pipeline.streamer.BaseStreamerFile.stream"]], "istreamer (class in cmoncrawl.processor.pipeline.streamer)": [[135, "cmoncrawl.processor.pipeline.streamer.IStreamer"]], "__init__() (cmoncrawl.processor.pipeline.streamer.istreamer method)": [[135, "cmoncrawl.processor.pipeline.streamer.IStreamer.__init__"], [136, "cmoncrawl.processor.pipeline.streamer.IStreamer.__init__"]], "clean_up() (cmoncrawl.processor.pipeline.streamer.istreamer method)": [[137, "cmoncrawl.processor.pipeline.streamer.IStreamer.clean_up"]], "stream() (cmoncrawl.processor.pipeline.streamer.istreamer method)": [[138, "cmoncrawl.processor.pipeline.streamer.IStreamer.stream"]], "streamerdummy (class in cmoncrawl.processor.pipeline.streamer)": [[139, 
"cmoncrawl.processor.pipeline.streamer.StreamerDummy"]], "__init__() (cmoncrawl.processor.pipeline.streamer.streamerdummy method)": [[139, "cmoncrawl.processor.pipeline.streamer.StreamerDummy.__init__"], [140, "cmoncrawl.processor.pipeline.streamer.StreamerDummy.__init__"]], "clean_up() (cmoncrawl.processor.pipeline.streamer.streamerdummy method)": [[141, "cmoncrawl.processor.pipeline.streamer.StreamerDummy.clean_up"]], "stream() (cmoncrawl.processor.pipeline.streamer.streamerdummy method)": [[142, "cmoncrawl.processor.pipeline.streamer.StreamerDummy.stream"]], "streamerfilehtml (class in cmoncrawl.processor.pipeline.streamer)": [[143, "cmoncrawl.processor.pipeline.streamer.StreamerFileHTML"]], "__init__() (cmoncrawl.processor.pipeline.streamer.streamerfilehtml method)": [[143, "cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.__init__"], [144, "cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.__init__"]], "clean_up() (cmoncrawl.processor.pipeline.streamer.streamerfilehtml method)": [[145, "cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.clean_up"]], "get_file_name() (cmoncrawl.processor.pipeline.streamer.streamerfilehtml method)": [[146, "cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.get_file_name"]], "metadata_to_string() (cmoncrawl.processor.pipeline.streamer.streamerfilehtml method)": [[147, "cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.metadata_to_string"]], "stream() (cmoncrawl.processor.pipeline.streamer.streamerfilehtml method)": [[148, "cmoncrawl.processor.pipeline.streamer.StreamerFileHTML.stream"]], "streamerfilejson (class in cmoncrawl.processor.pipeline.streamer)": [[149, "cmoncrawl.processor.pipeline.streamer.StreamerFileJSON"]], "__init__() (cmoncrawl.processor.pipeline.streamer.streamerfilejson method)": [[149, "cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.__init__"], [150, "cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.__init__"]], "clean_up() 
(cmoncrawl.processor.pipeline.streamer.streamerfilejson method)": [[151, "cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.clean_up"]], "get_file_name() (cmoncrawl.processor.pipeline.streamer.streamerfilejson method)": [[152, "cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.get_file_name"]], "metadata_to_string() (cmoncrawl.processor.pipeline.streamer.streamerfilejson method)": [[153, "cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.metadata_to_string"]], "stream() (cmoncrawl.processor.pipeline.streamer.streamerfilejson method)": [[154, "cmoncrawl.processor.pipeline.streamer.StreamerFileJSON.stream"]]}})
\ No newline at end of file
diff --git a/docs/build/html/usage.html b/docs/build/html/usage.html
new file mode 100644
index 00000000..232c40d3
--- /dev/null
+++ b/docs/build/html/usage.html
@@ -0,0 +1,536 @@
+
+
+
+
+
+
+
+
+ Usage — CmonCrawl 1.0.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Usage
+
+
+
+
+
+
Contents
+
+
+
+
+
+
+
+
+
+
+Usage
+The library is designed to make interaction with CommonCrawl’s indexes simple,
+while also providing a framework for extracting data from the downloaded
+HTMLs.
+You can use the library in two ways:
+
+Command Line Interface - This should suffice for 99% of the use cases.
+Custom Pipeline - If you need more control over the process, you can use the library programmatically.
+
+
+Workflow
+The workflow is two-step:
+
+First download domain records (see Domain Record) from the indexes.
+Extract the domain records.
+
+
+
Note
+
This also allows you to share the domain records with others,
+so that you do not run into copyright issues.
+
+
+
Note
+
The first step can be skipped by using AWS Athena, which is, under
+current circumstances (the CommonCrawl API is completely throttled, slow, and dropping most requests),
+the preferred way. See How to get records from AWS Athena.
+It’s also super cheap.
+
+To create your custom extractors, you will likely want to download HTMLs rather than domain records.
+Both downloading to HTML and extracting from HTML are also supported, in parallel to the domain record workflow.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file