-
Notifications
You must be signed in to change notification settings - Fork 8
Adding WebScrappers
WebScrap can search the configured websites, detect new files to download, check their validity (maximum file size, not already existing, and excluded strings) and add them to downloads.
Websites can be configured in `config.ws.php` and used in `MENU->PASTE LINKS` and `MENU->SCRAPPERWEB`.
When links are sent, they are checked by type and added directly (elinks and magnets), or matched against the known urls of the scrappers and sent to the matching scrapper to try to extract the download.
Search for downloads in the selected scrapper and try to extract the download url by type (elinks, magnets, torrents and dd).
Example for examplesite.biz, where:
- It is UTF-8
- Downloads are magnets
- Duplicates are checked
- Max file size is checked (extracted from the page)
- Movies url: examplesite.biz/movies/*
- Series url: examplesite.biz/series/*
- Season url: examplesite.biz/* /season/*
- Episode url: examplesite.biz/* /episode/*
//Example web site for magnet
'examplesite' => array(
//Type: torrent|amule|magnets
'type' => 'magnets',
//Title: domain.com
'title' => 'examplesite.biz',
//Pass needed to get torrent/amule, from base page search, 1 pass if torrent/amule in next, 2 if hava second page to link, ...
'passnumber' => 3,
//HTML Code Format: UTF-8, ANSI, ...
'htmlformat' => 'UTF-8',
//Check Duplicates: search if file media title exist and cancel download
'duplicatescheck' => TRUE,
//Title Clean, remove strings from title for duplicates scan
'titleclean' => array(
':',
'1080',
'720',
'1080p',
'720p',
'..',
'...',
),
//Search Data In Web
'searchdata' => array(
//Web URL to search: 'http://examplesite.biz/?s='
'urlsearch' => 'https://www.examplesite.biz/?s=',
//Web URL to baselist: 'examplesite.com/?q='
'urlbase' => 'https://www.examplesite.biz/',
//URL Append to links: add to links for incomplete URLs: domain.com/
'linksappend' => 'https://www.examplesite.biz',
//html object have links: a or function
'linksobject' => 'a',
//alternative mode to get title
'linkstitle' => array( 'mode', param1, param2, ... ),
//Modes
'linkstitle' => array( 'inhtml', (-/+)sizetosearch, pre-text, posttext ),
'linkstitle' => array( 'inurl', pre-text, posttext )
//Extract image from search and use on list
'linksimage' => array( 'mode', param1, param2, ... ),
//Modes
'linksimage' => array( 'inhtml', (-/+)sizetosearch, pre-text, posttext ),
'linksimage' => array( 'near' ),
//String needed in linkTitle to be valid
'linktitleneeded' => array(),
//String needed in linkURL to be valid
'linkurlneeded' => array(
'/series/',
'/movies/',
'/season/',
),
//String Exclude in linkTitle to be valid
'linktitleexclude' => array(
'4K',
'v.o.s.e',
),
//String Exclude in linkURL to be valid
'linkurlexclude' => array(
'v.o.s.e',
),
//FILTER SIZE
//Max File Size: 0 disabled|X megabytes
'filtersizemax' => 2500,
//FILTER SIZE: get size between text: textpre + XX.XX Gb|Mb + textpos
'filtersizetextpre' => '<span class="size">',
'filtersizetextpos' => '</span>',
//FILTER SIZE: max distance from link (+:fordward, -:backward)
'filtersizetextdistance' => 1000,
//FILTER SIZE: especific size(MB)=function( $html )
'filtersizefunction' => '',
),
//Pass Config
'passdata' => array(
//Pass 1 Movies
0 => array(
//Needed In URL to be valid, if pass not valid search for valid pass to launch
'urlvalid' => '/movies/',
//Next pass: int|FALSE, if FALSE try to download file
'passnext' => 4,
//URL Append to links: add to links for incomplete URLs: domain.com/
'linksappend' => '',
//html object have links: a
'linksobject' => 'a',
//String needed in linkTitle to be valid
'linktitleneeded' => array(
'Downloads'
),
//String needed in linkURL to be valid
'linkurlneeded' => array(
'magnet'
),
//String Exclude in linkTitle to be valid
'linktitleexclude' => array(
'.rar',
'.zip',
'.7z',
'VOSE',
'V.O.S.E.',
'vose',
'v.o.s.e',
),
//String Exclude in linkURL to be valid
'linkurlexclude' => array(
'.rar',
'.zip',
'.7z',
'VOSE',
'V.O.S.E.',
'vose',
'v.o.s.e',
),
//FILTER SIZE
//Max File Size: 0 disabled|X megabytes
'filtersizemax' => 2500,
//FILTER SIZE: get size between text: textpre + XX.XX Gb|Mb + textpos
'filtersizetextpre' => '<span class="size">',
'filtersizetextpos' => '</span> ',
//FILTER SIZE: especific size(MB)=function( $html )
'filtersizefunction' => '',
//DOWNLOAD MULTIPLE
'downloadmultiple' => FALSE,
//DOWNLOAD function
'downloadfunction' => FALSE,
),
//Pass 2 SERIES
1 => array(
//Needed In URL to be valid, if pass not valid search for valid pass to launch
'urlvalid' => '/series/',
//Next pass: int|FALSE, if FALSE try to download file
'passnext' => 2,
//URL Append to links: add to links for incomplete URLs: domain.com/
'linksappend' => '',
//html object have links: a
'linksobject' => 'a',
//String needed in linkTitle to be valid
'linktitleneeded' => array(
'Download'
),
//String needed in linkURL to be valid
'linkurlneeded' => array(
'magnet'
),
//String Exclude in linkTitle to be valid
'linktitleexclude' => array(
'VOSE',
'V.O.S.E.',
'vose',
'v.o.s.e',
),
//String Exclude in linkURL to be valid
'linkurlexclude' => array(
'VOSE',
'V.O.S.E.',
'vose',
'v.o.s.e',
),
//FILTER SIZE
//Max File Size: 0 disabled|X megabytes
'filtersizemax' => 0,
//FILTER SIZE: get size between text: textpre + XX.XX Gb|Mb + textpos
'filtersizetextpre' => '<size class="size">',
'filtersizetextpos' => '</span> ',
//FILTER SIZE: especific size(MB)=function( $html )
'filtersizefunction' => '',
//DOWNLOAD MULTIPLE
'downloadmultiple' => FALSE,
//DOWNLOAD function
'downloadfunction' => FALSE,
),
//Pass 2 Season
2 => array(
//Needed In URL to be valid, if pass not valid search for valid pass to launch
'urlvalid' => '/season/',
//Next pass: int|FALSE, if FALSE try to download file
'passnext' => 3,
//URL Append to links: add to links for incomplete URLs: domain.com/
'linksappend' => '',
//html object have links: a
'linksobject' => 'a',
//String needed in linkTitle to be valid
'linktitleneeded' => array(
'Episode'
),
//String needed in linkURL to be valid
'linkurlneeded' => array(
'/episode/'
),
//String Exclude in linkTitle to be valid
'linktitleexclude' => array(
'VOSE',
'V.O.S.E.',
'vose',
'v.o.s.e',
),
//String Exclude in linkURL to be valid
'linkurlexclude' => array(
'VOSE',
'V.O.S.E.',
'vose',
'v.o.s.e',
),
//FILTER SIZE
//Max File Size: 0 disabled|X megabytes
'filtersizemax' => 0,
//FILTER SIZE: get size between text: textpre + XX.XX Gb|Mb + textpos
'filtersizetextpre' => '<span class="size">',
'filtersizetextpos' => '</span> ',
//FILTER SIZE: especific size(MB)=function( $html )
'filtersizefunction' => '',
//DOWNLOAD MULTIPLE
'downloadmultiple' => TRUE,
//DOWNLOAD function
'downloadfunction' => FALSE,
),
//Pass 3 Episode
3 => array(
//Needed In URL to be valid, if pass not valid search for valid pass to launch
'urlvalid' => '/episode/',
//Next pass: int|FALSE, if FALSE try to download file
'passnext' => 4,
//URL Append to links: add to links for incomplete URLs: domain.com/
'linksappend' => '',
//html object have links: a or function
'linksobject' => 'a',
//String needed in linkTitle to be valid
'linktitleneeded' => array(
'Download'
),
//String needed in linkURL to be valid
'linkurlneeded' => array(
'magnet'
),
//String Exclude in linkTitle to be valid
'linktitleexclude' => array(
'VOSE',
'V.O.S.E.',
'vose',
'v.o.s.e',
),
//String Exclude in linkURL to be valid
'linkurlexclude' => array(
'VOSE',
'V.O.S.E.',
'vose',
'v.o.s.e',
),
//FILTER SIZE
//Max File Size: 0 disabled|X megabytes
'filtersizemax' => 0,
//FILTER SIZE: get size between text: textpre + XX.XX Gb|Mb + textpos
'filtersizetextpre' => '<span class="size">',
'filtersizetextpos' => '</span> ',
//FILTER SIZE: especific size(MB)=function( $html )
'filtersizefunction' => '',
//DOWNLOAD MULTIPLE
'downloadmultiple' => FALSE,
//DOWNLOAD function
'downloadfunction' => FALSE,
),
//Pass 4 Add magnet
4 => array(
//Needed In URL to be valid, if pass not valid search for valid pass to launch
'urlvalid' => 'magnet:',
//Next pass: int|FALSE, if FALSE try to download file
'passnext' => FALSE,
//URL Append to links: add to links for incomplete URLs: domain.com/
'linksappend' => '',
//html object have links: a
'linksobject' => 'a',
//String needed in linkTitle to be valid
'linktitleneeded' => array(
),
//String needed in linkURL to be valid
'linkurlneeded' => array(
),
//String Exclude in linkTitle to be valid
'linktitleexclude' => array(
'.rar',
'.zip',
'.7z',
'VOSE',
'V.O.S.E.',
'vose',
'v.o.s.e',
),
//String Exclude in linkURL to be valid
'linkurlexclude' => array(
'.rar',
'.zip',
'.7z',
'VOSE',
'V.O.S.E.',
'vose',
'v.o.s.e',
),
//FILTER SIZE
//Max File Size: 0 disabled|X megabytes
'filtersizemax' => 0,
//FILTER SIZE: get size between text: textpre + XX.XX Gb|Mb + textpos
'filtersizetextpre' => '',
'filtersizetextpos' => '',
//FILTER SIZE: especific size(MB)=function( $html )
'filtersizefunction' => '',
//DOWNLOAD MULTIPLE
'downloadmultiple' => FALSE,
//DOWNLOAD function
'downloadfunction' => FALSE,
),
),
),
3 zones:
- Base config of the scrapper: title and the settings that apply to all passes
- Search config: params used when searching on the web page
- Passes to extract the download link: these are all the checks needed to reach the download link. Each pass is defined with a string to identify the url, extracts links, checks size and duplicates, and names the next pass to send links to after excluding/including links by the strings defined for titles and urls. The last pass is the one with `passnext=FALSE`, which hands over to the download action based on type or defined by function.
This is the basic web config zone. It defines:
- title: title shown in the list for `MENU->SCRAPPERWEB`
- type: type of download (bypassed by `downloadfunction` on the last pass); types are torrent|amule|magnets, and the method used to add to downloads is selected automatically from it
- passnumber: defines the last pass that goes directly to the download type (not needed, but can be defined)
- htmlformat: character encoding used to read the html of the page, needed if other than UTF-8
- duplicatescheck: check whether the title already exists and cancel the download
- titleclean: basic cleaning of titles, applied in every pass
'examplesite' => array(
//Type: torrent|amule|magnets
'type' => 'magnets',
//Title: domain.com
'title' => 'examplesite.biz',
//Pass needed to get torrent/amule, from base page search, 1 pass if torrent/amule in next, 2 if hava second page to link, ...
'passnumber' => 3,
//HTML Code Format: UTF-8, ANSI, ...
'htmlformat' => 'UTF-8',
//Check Duplicates: search if file media title exist and cancel download
'duplicatescheck' => TRUE,
//Title Clean, remove strings from title for duplicates scan
'titleclean' => array(
':',
'1080',
'720',
'1080p',
'720p',
'..',
'...',
),
This zone defines the actions to get links for the `MENU->SCRAPPERWEB` search action.
Define:
- urlsearch: url to which the search string is appended to get the result links
- urlbase: if there is no search string, links are extracted from this url. This value is also used to detect which scrapper to use when links are pasted in `MENU->PASTELINKS`, by comparing this domain with each pasted url
- linksappend: sometimes webs have relative links and the domain needs to be added to the extracted url
- linksobject: HTML object to get links from on the page; can be a function like `webscrap_extract_links_all_html` to extract all links from the raw html when the links are not in any object (e.g. in javascript)
- linkstitle: if the link data, title or alt does not give a valid title, an alternative mode can be used: `inhtml` searches between the params to get a valid title, and `inurl` extracts the title from the url between the params or from the last part of the url
- linksimage: can get the nearest image (mode `near`) or extract it from the html between the params (mode `inhtml`) and show it in the search result list
- linktitleneeded: strings needed in the title for the link to be valid (any of them)
- linkurlneeded: string needed in url to be valid link (any of them)
- linktitleexclude: strings in the title that make the link invalid (any of them)
- linkurlexclude: strings in the url that make the link invalid (any of them)
- filtersizemax: max file size in MB; links are excluded if the size can be extracted in this pass (from the search result html)
- filtersizetextpre: text before the size in the html (for `<span id="size">10GB</span>` it is `<span id="size">`). The size unit in the text is autodetected and required (MB, GB, KB)
- filtersizetextpos: text after the size in the html (for `<span id="size">10GB</span>` it is `</span>`). The size unit in the text is autodetected and required (MB, GB, KB)
- filtersizetextdistance: max distance in chars from the link within which size values are looked for; can be positive (after the link) or negative (before the link). Matches without the required size unit (MB, GB, KB) are discarded
- filtersizefunction: name of a specific function to extract the size; it receives the whole HTML as its parameter (`functioname($allhtml)`)
When a search or explore action is sent, this part returns an array of titles=>urls that is passed to the first pass of the config and checked: if the pass is valid, its actions run and either return more links or add the item to downloads; if the pass is invalid, the next pass is tried, and so on until the last configured pass.
//Search Data In Web
'searchdata' => array(
//Web URL to search: 'http://examplesite.biz/?s='
'urlsearch' => 'https://www.examplesite.biz/?s=',
//Web URL to baselist: 'examplesite.com/?q='
'urlbase' => 'https://www.examplesite.biz/',
//URL Append to links: add to links for incomplete URLs: domain.com/
'linksappend' => 'https://www.examplesite.biz',
//html object have links: a or function
'linksobject' => 'a',
//alternative mode to get title
'linkstitle' => array( 'mode', param1, param2, ... ),
//Modes (one of them)
'linkstitle' => array( 'inhtml', (-/+)sizetosearch, pre-text, posttext ),
'linkstitle' => array( 'inurl', pre-text, posttext )
//Extract image from search and use on list
'linksimage' => array( 'mode', param1, param2, ... ),
//Modes (one of them)
'linksimage' => array( 'inhtml', (-/+)sizetosearch, pre-text, posttext ),
'linksimage' => array( 'near' ),
//String needed in linkTitle to be valid
'linktitleneeded' => array(),
//String needed in linkURL to be valid
'linkurlneeded' => array(
'/series/',
'/movies/',
'/season/',
),
//String Exclude in linkTitle to be valid
'linktitleexclude' => array(
'4K',
'v.o.s.e',
),
//String Exclude in linkURL to be valid
'linkurlexclude' => array(
'v.o.s.e',
),
//FILTER SIZE
//Max File Size: 0 disabled|X megabytes
'filtersizemax' => 2500,
//FILTER SIZE: get size between text: textpre + XX.XX Gb|Mb + textpos
'filtersizetextpre' => '<span class="size">',
'filtersizetextpos' => '</span>',
//FILTER SIZE: max distance from link (+:fordward, -:backward)
'filtersizetextdistance' => 1000,
//FILTER SIZE: especific size(MB)=function( $html )
'filtersizefunction' => '',
),
Each pass is a possible url to be scanned on the way to the download link. In the example, the web has:
- frontal page (when search if empty return frontal page)
- page with movies and magnet in this page
- page with series and link to the season of the serie
- page with season and link to episode of the serie
- page with episode and magnet in this page
Options are like those of the search zone, with some additions:
- urlvalid: a link sent to this pass needs to contain this string in its url to be valid
- passnext: if the link is valid for this pass, its valid links are sent to the `passnext` pass; it can be FALSE (this is the last pass and the download action is taken, by `type` or by the specified `downloadfunction`) or numeric, setting the next pass to send valid links to
- downloadmultiple: set to look for and send all valid links (TRUE) or only the first valid link (FALSE)
- downloadfunction: name of the function that takes action on the valid link (`functioname($link)`)
- load the html of `urlbase`
- extract all links in object `a` or with the extractor function name
- check valid links, including and excluding
- clean titles and check duplicates
- return valid links to the selector
- load the html of `urlsearch` + search string
- extract all links in object `a` or with the extractor function name
- check valid links, including and excluding
- clean titles and check duplicates
- return valid links to the selector
- check that the link is valid for the pass with `urlvalid` (the url needs to contain this string); if not valid, send it to the next pass
- if `passnext` is FALSE, check for a valid function in `downloadfunction`, or send to `type`, and finish
- if `passnext` is a valid numeric pass:
- extract all links in object `a` or with the extractor function name
- check valid links, including and excluding
- clean titles and check duplicates
- send to `passnext` and start that pass
With a movie link (`/movies/`):
- Check pass 0 `urlvalid` (`/movies/`) and extract all `a` links that have `linktitleneeded` in the title and `linkurlneeded` in the url
- send to pass 4 as configured in `passnext`
- pass 4 has `passnext=FALSE`: check for a download function and, as there is none, check `type` and send the link to the appropriate function for magnets.
With a series link (`/series/`):
- Check pass 0 `urlvalid` (`/movies/`) and discard, send to the next pass
- Check pass 1 `urlvalid` (`/series/`) and extract all `a` links that have `linktitleneeded` in the title and `linkurlneeded` in the url, and send to pass 2 as indicated in `passnext` (after filtering, only `/season/` links are passed on)
- Check pass 2 `urlvalid` (`/season/`) and extract all `a` links that have `linktitleneeded` in the title and `linkurlneeded` in the url, and send to pass 3 as indicated in `passnext` (after filtering, only `/episode/` links are passed on); all the links are sent, as configured in `downloadmultiple`, because the `/season/` page has the links for all `/episode/` pages
- Check pass 3 `urlvalid` (`/episode/`) and extract all `a` links that have `linktitleneeded` in the title and `linkurlneeded` in the url, and send to pass 4 as indicated in `passnext` (after filtering, only `magnet:` links are passed on)
- Check pass 4 `urlvalid` (`magnet:`); that pass has `passnext=FALSE`: check for a download function and, as there is none, check `type` and send the link to the appropriate function for magnets.