diff --git a/README.MD b/README.MD index fc933c6..3cf8bd0 100644 --- a/README.MD +++ b/README.MD @@ -12,14 +12,24 @@ ## 数据库 -仅支持mysql,DDL代码参考[文件](https://github.com/tengfei-xy/amazon-crawler/blob/main/sql/ddl.sql),由于工具需要关键词来进行搜索,所以需要初始化一些关键词,参考[关键词数据库文件]((https://github.com/tengfei-xy/amazon-crawler/blob/main/sql/category.sql)) +仅支持mysql,DDL代码参考[文件](https://github.com/tengfei-xy/amazon-crawler/blob/main/sql/ddl.sql), -导入文件并初始化参考命令: +导入文件并初始化数据库参考命令: ``` bin/mysql -u root -p < xxx.sql ``` +由于工具需要关键词来进行搜索,所以需要初始化一些关键词,参考[关键词数据库文件](https://github.com/tengfei-xy/amazon-crawler/blob/main/sql/category.sql) + +导入方式 + +``` +bin/mysql -D amazon -u root -p < xxx.sql +``` + + + ## 配置文件 复制config.yaml.save为config.yaml diff --git a/config.yaml.save b/config.yaml.save index 5402499..1e4fb2e 100644 --- a/config.yaml.save +++ b/config.yaml.save @@ -10,7 +10,13 @@ basic: # 测试模式,将不连接数据库 test: false + # 填写亚马逊的域名,比如 www.amazon.co.uk,www.amazon.com + # 建议填写三级域名 + domain: "www.amazon.com" + proxy: + # 设置是否启动代理 + enable: true socks5: # socks5必须使用,目前仅仅支持一个 # 启动socks代理,可以尝试安装gost diff --git a/main.go b/main.go index 473b6af..debfd11 100644 --- a/main.go +++ b/main.go @@ -14,7 +14,6 @@ import ( "gopkg.in/yaml.v3" ) -const AMAZON_UK = "https://www.amazon.co.uk" const MYSQL_APPLICATION_STATUS_START int = 0 const MYSQL_APPLICATION_STATUS_OVER int = 1 const MYSQL_APPLICATION_STATUS_SEARCH int = 2 @@ -40,11 +39,14 @@ type Enable struct { Trn bool `yaml:"trn"` } type Basic struct { - App_id int `yaml:"app_id"` - Host_id int `yaml:"host_id"` - Test bool `yaml:"test"` + App_id int `yaml:"app_id"` + Host_id int `yaml:"host_id"` + Test bool `yaml:"test"` + Domain string `yaml:"domain"` } type Proxy struct { + Enable bool `yaml:"enable"` + Sockc5 []string `yaml:"socks5"` } type Mysql struct { @@ -56,7 +58,6 @@ type Mysql struct { } type flagStruct struct { config_file string - web bool } var app appConfig @@ -70,7 +71,7 @@ func init_config(flag flagStruct) { if err != nil { panic(err) } - if 
!app.Enable.Search && !app.Enable.Seller && !app.Enable.Trn { + if !app.Exec.Enable.Search && !app.Exec.Enable.Seller && !app.Exec.Enable.Trn { panic("没有启动功能,检查配置文件的enable配置的选项") } log.Infof("程序标识:%d 主机标识:%d", app.Basic.App_id, app.Basic.Host_id) @@ -119,7 +120,6 @@ func init_signal() { func init_flag() flagStruct { var f flagStruct flag.StringVar(&f.config_file, "c", "config.yaml", "打开配置文件") - flag.BoolVar(&f.web, "web", false, "启动web") flag.Parse() return f } diff --git a/proxy.go b/proxy.go index d7ca024..533a527 100644 --- a/proxy.go +++ b/proxy.go @@ -1,6 +1,8 @@ package main import ( + "fmt" + "math/rand" "net" "net/http" "time" @@ -8,9 +10,17 @@ import ( "golang.org/x/net/proxy" ) +func rangdom_range(max int) int { + rand.NewSource(time.Now().UnixNano()) + return rand.Intn(max) +} func get_socks5_proxy() (proxy.Dialer, error) { // 创建一个SOCKS5代理拨号器 - return proxy.SOCKS5("tcp", app.Proxy.Sockc5[0], nil, proxy.Direct) + len := len(app.Proxy.Sockc5) + if len == 0 { + return nil, fmt.Errorf("没有可用的代理") + } + return proxy.SOCKS5("tcp", app.Proxy.Sockc5[rangdom_range(len)], nil, proxy.Direct) } func get_client() http.Client { @@ -18,12 +28,16 @@ func get_client() http.Client { if err != nil { return http.Client{Timeout: time.Second * 60} } - return http.Client{ - Transport: &http.Transport{ - Dial: proxy.Dial, - }, + if app.Proxy.Enable { + return http.Client{ + Transport: &http.Transport{ + Dial: proxy.Dial, + }, - Timeout: time.Second * 60, + Timeout: time.Second * 60, + } + } else { + return http.Client{Timeout: time.Second * 60} } } diff --git a/search.go b/search.go index ed0b305..eb76d15 100644 --- a/search.go +++ b/search.go @@ -142,7 +142,7 @@ func (s *search) request(seq int) (*goquery.Document, error) { // -H 'viewport-width: 2028' \ // --compressed - url := fmt.Sprintf("https://www.amazon.co.uk/s?k=%s&page=%d&crid=2V9436DZJ6IJF&qid=1699839233&sprefix=clothe%%2Caps%%2C552&ref=sr_pg_2", s.en_key, seq) + url := 
fmt.Sprintf("https://%s/s?k=%s&page=%d&crid=2V9436DZJ6IJF&qid=1699839233&sprefix=clothe%%2Caps%%2C552&ref=sr_pg_2", app.Domain, s.en_key, seq) log.Infof("开始搜索 关键词:%s 页面:%d url:%s", s.zh_key, seq, url) client := get_client() diff --git a/seller.go b/seller.go index 6631ef0..7118e74 100644 --- a/seller.go +++ b/seller.go @@ -50,7 +50,7 @@ func (seller *sellerStruct) main() error { continue } - url = AMAZON_UK + url + param + url = app.Domain + url + param log.Infof("查找商家链接 ID:%d url:%s", primary_id, url) err := seller.request(url) if err != nil { @@ -112,7 +112,7 @@ func (seller *sellerStruct) request(url string) error { if err != nil { return err } - req.Header.Set("Authority", `www.amazon.co.uk`) + req.Header.Set("Authority", app.Domain) req.Header.Set("Accept", `text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7`) req.Header.Set("Accept-Language", `zh-CN,zh;q=0.9`) req.Header.Set("cache-control", `max-age=0`) @@ -127,7 +127,7 @@ func (seller *sellerStruct) request(url string) error { req.Header.Set("Cookie", app.cookie) } req.Header.Set("upgrade-insecure-requests", `1`) - req.Header.Set("Referer", "https://www.amazon.co.uk/s?k=Hardware+electricia%27n&crid=3CR8DCX0B3L5U&sprefix=hardware+electricia%27n%2Caps%2C714&ref=nb_sb_noss") + req.Header.Set("Referer", fmt.Sprintf("https://%s/?k=Hardware+electricia%%27n&crid=3CR8DCX0B3L5U&sprefix=hardware+electricia%%27n%%2Caps%%2C714&ref=nb_sb_noss", app.Domain)) req.Header.Set("Sec-Fetch-Dest", `empty`) req.Header.Set("Sec-Fetch-Mode", `cors`) req.Header.Set("Sec-Fetch-Site", `same-origin`) diff --git a/sql/category.sql b/sql/category.sql index d83e62d..c6852f7 100644 --- a/sql/category.sql +++ b/sql/category.sql @@ -1,4 +1,3 @@ -[tengfei@core ~]$ cat cateory.sql -- MySQL dump 10.13 Distrib 5.7.35, for Linux (x86_64) -- -- Host: localhost Database: amazon @@ -51,5 +50,4 @@ UNLOCK TABLES; /*!40101 SET 
COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; --- Dump completed on 2023-11-21 10:42:28 -[tengfei@core ~]$ +-- Dump completed on 2023-11-21 10:42:28 \ No newline at end of file diff --git a/sql/ddl.sql b/sql/ddl.sql index 0ff0957..fb090ef 100644 --- a/sql/ddl.sql +++ b/sql/ddl.sql @@ -37,29 +37,29 @@ SET character_set_client = utf8; SET character_set_client = @saved_cs_client; -- --- Temporary table structure for view `公司信息表` +-- Temporary table structure for view `占用空间表` -- -DROP TABLE IF EXISTS `公司信息表`; -/*!50001 DROP VIEW IF EXISTS `公司信息表`*/; +DROP TABLE IF EXISTS `占用空间表`; +/*!50001 DROP VIEW IF EXISTS `占用空间表`*/; SET @saved_cs_client = @@character_set_client; SET character_set_client = utf8; -/*!50001 CREATE VIEW `公司信息表` AS SELECT - 1 AS `数量`, - 1 AS `count(*)`*/; +/*!50001 CREATE VIEW `占用空间表` AS SELECT + 1 AS `Database`, + 1 AS `Size (MB)`*/; SET character_set_client = @saved_cs_client; -- --- Temporary table structure for view `占用空间表` +-- Temporary table structure for view `商家查找表` -- -DROP TABLE IF EXISTS `占用空间表`; -/*!50001 DROP VIEW IF EXISTS `占用空间表`*/; +DROP TABLE IF EXISTS `商家查找表`; +/*!50001 DROP VIEW IF EXISTS `商家查找表`*/; SET @saved_cs_client = @@character_set_client; SET character_set_client = utf8; -/*!50001 CREATE VIEW `占用空间表` AS SELECT - 1 AS `Database`, - 1 AS `Size (MB)`*/; +/*!50001 CREATE VIEW `商家查找表` AS SELECT + 1 AS `数量`, + 1 AS `count(*)`*/; SET character_set_client = @saved_cs_client; -- @@ -115,7 +115,7 @@ CREATE TABLE `application` ( `status` tinyint(1) NOT NULL DEFAULT '0', `update` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, PRIMARY KEY (`id`) -) ENGINE=InnoDB AUTO_INCREMENT=82 DEFAULT CHARSET=utf8mb4; +) ENGINE=InnoDB AUTO_INCREMENT=125 DEFAULT CHARSET=utf8mb4; /*!40101 SET character_set_client = @saved_cs_client */; -- @@ -131,7 +131,7 @@ CREATE TABLE `category` ( `en_key` varchar(50) NOT NULL, `priority` int(11) DEFAULT '0', PRIMARY KEY (`id`) -) ENGINE=InnoDB 
AUTO_INCREMENT=1014 DEFAULT CHARSET=utf8; +) ENGINE=InnoDB AUTO_INCREMENT=1015 DEFAULT CHARSET=utf8; /*!40101 SET character_set_client = @saved_cs_client */; -- @@ -204,7 +204,7 @@ CREATE TABLE `seller` ( `company_id` char(16) DEFAULT NULL, PRIMARY KEY (`id`), UNIQUE KEY `seller_id_UNIQUE` (`seller_id`) -) ENGINE=InnoDB AUTO_INCREMENT=100739 DEFAULT CHARSET=utf8; +) ENGINE=InnoDB AUTO_INCREMENT=115613 DEFAULT CHARSET=utf8; /*!40101 SET character_set_client = @saved_cs_client */; -- @@ -232,10 +232,10 @@ USE `amazon`; /*!50001 SET collation_connection = @saved_col_connection */; -- --- Final view structure for view `公司信息表` +-- Final view structure for view `占用空间表` -- -/*!50001 DROP VIEW IF EXISTS `公司信息表`*/; +/*!50001 DROP VIEW IF EXISTS `占用空间表`*/; /*!50001 SET @saved_cs_client = @@character_set_client */; /*!50001 SET @saved_cs_results = @@character_set_results */; /*!50001 SET @saved_col_connection = @@collation_connection */; @@ -244,16 +244,16 @@ USE `amazon`; /*!50001 SET collation_connection = utf8mb4_general_ci */; /*!50001 CREATE ALGORITHM=UNDEFINED */ /*!50013 DEFINER=`amazon`@`%` SQL SECURITY DEFINER */ -/*!50001 VIEW `公司信息表` AS select (case `seller`.`info_status` when 0 then '没查找' when 1 then '公司ID' when 2 then '已完整' when 3 then '没有信息' when 4 then '多个信息' end) AS `数量`,count(0) AS `count(*)` from `seller` where (`seller`.`status` = 1) group by `seller`.`info_status` */; +/*!50001 VIEW `占用空间表` AS select `information_schema`.`tables`.`TABLE_SCHEMA` AS `Database`,((sum((`information_schema`.`tables`.`DATA_LENGTH` + `information_schema`.`tables`.`INDEX_LENGTH`)) / 1024) / 1024) AS `Size (MB)` from `information_schema`.`tables` group by `information_schema`.`tables`.`TABLE_SCHEMA` */; /*!50001 SET character_set_client = @saved_cs_client */; /*!50001 SET character_set_results = @saved_cs_results */; /*!50001 SET collation_connection = @saved_col_connection */; -- --- Final view structure for view `占用空间表` +-- Final view structure for view `商家查找表` -- -/*!50001 DROP 
VIEW IF EXISTS `占用空间表`*/; +/*!50001 DROP VIEW IF EXISTS `商家查找表`*/; /*!50001 SET @saved_cs_client = @@character_set_client */; /*!50001 SET @saved_cs_results = @@character_set_results */; /*!50001 SET @saved_col_connection = @@collation_connection */; @@ -262,7 +262,7 @@ USE `amazon`; /*!50001 SET collation_connection = utf8mb4_general_ci */; /*!50001 CREATE ALGORITHM=UNDEFINED */ /*!50013 DEFINER=`amazon`@`%` SQL SECURITY DEFINER */ -/*!50001 VIEW `占用空间表` AS select `information_schema`.`tables`.`TABLE_SCHEMA` AS `Database`,((sum((`information_schema`.`tables`.`DATA_LENGTH` + `information_schema`.`tables`.`INDEX_LENGTH`)) / 1024) / 1024) AS `Size (MB)` from `information_schema`.`tables` group by `information_schema`.`tables`.`TABLE_SCHEMA` */; +/*!50001 VIEW `商家查找表` AS select (case `seller`.`status` when 0 then '未查找' when 1 then '中国ID' when 2 then '空ID' when 3 then '其他ID' when 4 then '异常ID' end) AS `数量`,count(0) AS `count(*)` from `seller` group by `seller`.`status` */; /*!50001 SET character_set_client = @saved_cs_client */; /*!50001 SET character_set_results = @saved_cs_results */; /*!50001 SET collation_connection = @saved_col_connection */; @@ -330,4 +330,4 @@ USE `amazon`; /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; --- Dump completed on 2023-11-21 10:37:17 \ No newline at end of file +-- Dump completed on 2024-02-02 19:05:53 \ No newline at end of file diff --git a/trn.go b/trn.go index 84884f6..36dfb71 100644 --- a/trn.go +++ b/trn.go @@ -94,7 +94,7 @@ func (trn *trnStruct) main() error { // // 找到 91440101MA9Y624U3K func (trn *trnStruct) request() error { - trn.url = fmt.Sprintf("%s/sp?ie=UTF8&seller=%s", AMAZON_UK, trn.seller_id) + trn.url = fmt.Sprintf("%s/sp?ie=UTF8&seller=%s", app.Domain, trn.seller_id) log.Infof("查找TRN 链接: %s", trn.url) @@ -117,7 +117,7 @@ func (trn *trnStruct) request() error { if err != nil { return err } - req.Header.Set("Authority", `www.amazon.co.uk`) + 
req.Header.Set("Authority", app.Domain) req.Header.Set("Accept", `text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7`) req.Header.Set("Accept-Language", `zh-CN,zh;q=0.9`) req.Header.Set("cache-control", `max-age=0`) @@ -132,7 +132,7 @@ func (trn *trnStruct) request() error { req.Header.Set("Cookie", app.cookie) } req.Header.Set("upgrade-insecure-requests", `1`) - req.Header.Set("Referer", "https://www.amazon.co.uk/s?k=Hardware+electricia%27n&crid=3CR8DCX0B3L5U&sprefix=hardware+electricia%27n%2Caps%2C714&ref=nb_sb_noss") + req.Header.Set("Referer", fmt.Sprintf("https://%s/?k=Hardware+electricia%%27n&crid=3CR8DCX0B3L5U&sprefix=hardware+electricia%%27n%%2Caps%%2C714&ref=nb_sb_noss", app.Domain)) req.Header.Set("Sec-Fetch-Dest", `empty`) req.Header.Set("Sec-Fetch-Mode", `cors`) req.Header.Set("Sec-Fetch-Site", `same-origin`)