Skip to content

Commit

Permalink
v1.1.0
Browse files Browse the repository at this point in the history
  • Loading branch information
tengfei-xy committed Feb 2, 2024
1 parent 81c64f2 commit 0260e87
Show file tree
Hide file tree
Showing 9 changed files with 75 additions and 47 deletions.
14 changes: 12 additions & 2 deletions README.MD
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,24 @@

## 数据库

仅支持mysql,DDL代码参考[文件](https://github.com/tengfei-xy/amazon-crawler/blob/main/sql/ddl.sql)。由于工具需要关键词来进行搜索,所以需要初始化一些关键词,参考[关键词数据库文件](https://github.com/tengfei-xy/amazon-crawler/blob/main/sql/category.sql)
仅支持mysql,DDL代码参考[文件](https://github.com/tengfei-xy/amazon-crawler/blob/main/sql/ddl.sql)

导入文件并初始化参考命令
导入文件并初始化数据库参考命令

```
bin/mysql -u root -p < xxx.sql
```

由于工具需要关键词来进行搜索,所以需要初始化一些关键词,参考[关键词数据库文件](https://github.com/tengfei-xy/amazon-crawler/blob/main/sql/category.sql)

导入方式

```
bin/mysql -D amazon -u root -p < xxx.sql
```



## 配置文件

复制config.yaml.save为config.yaml
Expand Down
6 changes: 6 additions & 0 deletions config.yaml.save
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,13 @@ basic:
# 测试模式,将不连接数据库
test: false

# 填写亚马逊的域名,比如 www.amazon.co.uk,www.amazon.com
# 建议填写三级域名
domain: "www.amazon.com"

proxy:
# 设置是否启动代理
enable: true
socks5:
# socks5必须使用,目前仅仅支持一个
# 启动socks代理,可以尝试安装gost
Expand Down
14 changes: 7 additions & 7 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ import (
"gopkg.in/yaml.v3"
)

const AMAZON_UK = "https://www.amazon.co.uk"
const MYSQL_APPLICATION_STATUS_START int = 0
const MYSQL_APPLICATION_STATUS_OVER int = 1
const MYSQL_APPLICATION_STATUS_SEARCH int = 2
Expand All @@ -40,11 +39,14 @@ type Enable struct {
Trn bool `yaml:"trn"`
}
// Basic mirrors the `basic` section of the YAML configuration file.
//
// Each field is declared exactly once; declaring a field name twice in the
// same struct is a compile error in Go.
type Basic struct {
	// App_id is the program identifier reported in the start-up log line.
	App_id int `yaml:"app_id"`
	// Host_id is the host identifier reported in the start-up log line.
	Host_id int `yaml:"host_id"`
	// Test switches on test mode; the sample config describes this as
	// running without a database connection.
	Test bool `yaml:"test"`
	// Domain is the Amazon host name to crawl, for example "www.amazon.com"
	// or "www.amazon.co.uk".
	Domain string `yaml:"domain"`
}
// Proxy mirrors the `proxy` section of the YAML configuration file.
type Proxy struct {
// Enable is read from the `enable` key; the sample config describes it as
// the switch that turns the proxy on or off.
Enable bool `yaml:"enable"`

// Sockc5 is the list of SOCKS5 proxy addresses read from the `socks5` key.
// NOTE(review): the Go field name looks like a typo for "Socks5"; it is kept
// because other code refers to it by this name.
Sockc5 []string `yaml:"socks5"`
}
type Mysql struct {
Expand All @@ -56,7 +58,6 @@ type Mysql struct {
}
// flagStruct holds the values parsed from the command-line flags.
type flagStruct struct {
// config_file is the path of the configuration file (set by the -c flag).
config_file string
// web is the value of the -web flag.
web bool
}

var app appConfig
Expand All @@ -70,7 +71,7 @@ func init_config(flag flagStruct) {
if err != nil {
panic(err)
}
if !app.Enable.Search && !app.Enable.Seller && !app.Enable.Trn {
if !app.Exec.Enable.Search && !app.Exec.Enable.Seller && !app.Exec.Enable.Trn {
panic("没有启动功能,检查配置文件的enable配置的选项")
}
log.Infof("程序标识:%d 主机标识:%d", app.Basic.App_id, app.Basic.Host_id)
Expand Down Expand Up @@ -119,7 +120,6 @@ func init_signal() {
// init_flag registers the program's command-line flags on the default flag
// set, parses the process arguments and returns the parsed values.
func init_flag() flagStruct {
	opts := flagStruct{}

	// Registration order does not matter; the flag package keys flags by name.
	flag.BoolVar(&opts.web, "web", false, "启动web")
	flag.StringVar(&opts.config_file, "c", "config.yaml", "打开配置文件")

	flag.Parse()
	return opts
}
Expand Down
26 changes: 20 additions & 6 deletions proxy.go
Original file line number Diff line number Diff line change
@@ -1,29 +1,43 @@
package main

import (
"fmt"
"math/rand"
"net"
"net/http"
"time"

"golang.org/x/net/proxy"
)

// rangdom_range returns a pseudo-random integer in the half-open interval
// [0, max).
//
// It panics when max <= 0 (this is the behaviour of Intn), so callers must
// rule out an empty range before calling it.
func rangdom_range(max int) int {
	// rand.NewSource only builds a Source value; calling it and dropping the
	// result seeds nothing. Wrapping the source in rand.New makes the
	// time-based seed actually drive the number that is returned.
	return rand.New(rand.NewSource(time.Now().UnixNano())).Intn(max)
}
// get_socks5_proxy builds a SOCKS5 dialer for one of the proxy addresses in
// app.Proxy.Sockc5, picking the address at random on every call.
//
// It returns an error when no address is configured; otherwise it returns
// whatever proxy.SOCKS5 returns.
func get_socks5_proxy() (proxy.Dialer, error) {
	// The local is called count, not len, so the builtin len stays usable.
	count := len(app.Proxy.Sockc5)
	if count == 0 {
		// Checked before indexing: an empty list must yield an error, not an
		// out-of-range panic.
		return nil, fmt.Errorf("没有可用的代理")
	}

	// No proxy credentials are passed (nil auth); proxy.Direct is the
	// forwarding dialer used to reach the proxy itself.
	addr := app.Proxy.Sockc5[rangdom_range(count)]
	return proxy.SOCKS5("tcp", addr, nil, proxy.Direct)
}
func get_client() http.Client {

proxy, err := get_socks5_proxy()
if err != nil {
return http.Client{Timeout: time.Second * 60}
}
return http.Client{
Transport: &http.Transport{
Dial: proxy.Dial,
},
if app.Proxy.Enable {
return http.Client{
Transport: &http.Transport{
Dial: proxy.Dial,
},

Timeout: time.Second * 60,
Timeout: time.Second * 60,
}
} else {
return http.Client{Timeout: time.Second * 60}
}
}

Expand Down
2 changes: 1 addition & 1 deletion search.go
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ func (s *search) request(seq int) (*goquery.Document, error) {
// -H 'viewport-width: 2028' \
// --compressed

url := fmt.Sprintf("https://www.amazon.co.uk/s?k=%s&page=%d&crid=2V9436DZJ6IJF&qid=1699839233&sprefix=clothe%%2Caps%%2C552&ref=sr_pg_2", s.en_key, seq)
url := fmt.Sprintf("https://%s/s?k=%s&page=%d&crid=2V9436DZJ6IJF&qid=1699839233&sprefix=clothe%%2Caps%%2C552&ref=sr_pg_2", app.Domain, s.en_key, seq)
log.Infof("开始搜索 关键词:%s 页面:%d url:%s", s.zh_key, seq, url)

client := get_client()
Expand Down
6 changes: 3 additions & 3 deletions seller.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ func (seller *sellerStruct) main() error {
continue
}

url = AMAZON_UK + url + param
url = app.Domain + url + param
log.Infof("查找商家链接 ID:%d url:%s", primary_id, url)
err := seller.request(url)
if err != nil {
Expand Down Expand Up @@ -112,7 +112,7 @@ func (seller *sellerStruct) request(url string) error {
if err != nil {
return err
}
req.Header.Set("Authority", `www.amazon.co.uk`)
req.Header.Set("Authority", app.Domain)
req.Header.Set("Accept", `text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7`)
req.Header.Set("Accept-Language", `zh-CN,zh;q=0.9`)
req.Header.Set("cache-control", `max-age=0`)
Expand All @@ -127,7 +127,7 @@ func (seller *sellerStruct) request(url string) error {
req.Header.Set("Cookie", app.cookie)
}
req.Header.Set("upgrade-insecure-requests", `1`)
req.Header.Set("Referer", "https://www.amazon.co.uk/s?k=Hardware+electricia%27n&crid=3CR8DCX0B3L5U&sprefix=hardware+electricia%27n%2Caps%2C714&ref=nb_sb_noss")
req.Header.Set("Referer", fmt.Sprintf("https://%s/?k=Hardware+electricia%%27n&crid=3CR8DCX0B3L5U&sprefix=hardware+electricia%%27n%%2Caps%%2C714&ref=nb_sb_noss", app.Domain))
req.Header.Set("Sec-Fetch-Dest", `empty`)
req.Header.Set("Sec-Fetch-Mode", `cors`)
req.Header.Set("Sec-Fetch-Site", `same-origin`)
Expand Down
4 changes: 1 addition & 3 deletions sql/category.sql
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
[tengfei@core ~]$ cat cateory.sql
-- MySQL dump 10.13 Distrib 5.7.35, for Linux (x86_64)
--
-- Host: localhost Database: amazon
Expand Down Expand Up @@ -51,5 +50,4 @@ UNLOCK TABLES;
/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;

-- Dump completed on 2023-11-21 10:42:28
[tengfei@core ~]$
-- Dump completed on 2023-11-21 10:42:28
44 changes: 22 additions & 22 deletions sql/ddl.sql
Original file line number Diff line number Diff line change
Expand Up @@ -37,29 +37,29 @@ SET character_set_client = utf8;
SET character_set_client = @saved_cs_client;

--
-- Temporary table structure for view `公司信息表`
-- Temporary table structure for view `占用空间表`
--

DROP TABLE IF EXISTS `公司信息表`;
/*!50001 DROP VIEW IF EXISTS `公司信息表`*/;
DROP TABLE IF EXISTS `占用空间表`;
/*!50001 DROP VIEW IF EXISTS `占用空间表`*/;
SET @saved_cs_client = @@character_set_client;
SET character_set_client = utf8;
/*!50001 CREATE VIEW `公司信息表` AS SELECT
1 AS `数量`,
1 AS `count(*)`*/;
/*!50001 CREATE VIEW `占用空间表` AS SELECT
1 AS `Database`,
1 AS `Size (MB)`*/;
SET character_set_client = @saved_cs_client;

--
-- Temporary table structure for view `占用空间表`
-- Temporary table structure for view `商家查找表`
--

DROP TABLE IF EXISTS `占用空间表`;
/*!50001 DROP VIEW IF EXISTS `占用空间表`*/;
DROP TABLE IF EXISTS `商家查找表`;
/*!50001 DROP VIEW IF EXISTS `商家查找表`*/;
SET @saved_cs_client = @@character_set_client;
SET character_set_client = utf8;
/*!50001 CREATE VIEW `占用空间表` AS SELECT
1 AS `Database`,
1 AS `Size (MB)`*/;
/*!50001 CREATE VIEW `商家查找表` AS SELECT
1 AS `数量`,
1 AS `count(*)`*/;
SET character_set_client = @saved_cs_client;

--
Expand Down Expand Up @@ -115,7 +115,7 @@ CREATE TABLE `application` (
`status` tinyint(1) NOT NULL DEFAULT '0',
`update` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=82 DEFAULT CHARSET=utf8mb4;
) ENGINE=InnoDB AUTO_INCREMENT=125 DEFAULT CHARSET=utf8mb4;
/*!40101 SET character_set_client = @saved_cs_client */;

--
Expand All @@ -131,7 +131,7 @@ CREATE TABLE `category` (
`en_key` varchar(50) NOT NULL,
`priority` int(11) DEFAULT '0',
PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1014 DEFAULT CHARSET=utf8;
) ENGINE=InnoDB AUTO_INCREMENT=1015 DEFAULT CHARSET=utf8;
/*!40101 SET character_set_client = @saved_cs_client */;

--
Expand Down Expand Up @@ -204,7 +204,7 @@ CREATE TABLE `seller` (
`company_id` char(16) DEFAULT NULL,
PRIMARY KEY (`id`),
UNIQUE KEY `seller_id_UNIQUE` (`seller_id`)
) ENGINE=InnoDB AUTO_INCREMENT=100739 DEFAULT CHARSET=utf8;
) ENGINE=InnoDB AUTO_INCREMENT=115613 DEFAULT CHARSET=utf8;
/*!40101 SET character_set_client = @saved_cs_client */;

--
Expand Down Expand Up @@ -232,10 +232,10 @@ USE `amazon`;
/*!50001 SET collation_connection = @saved_col_connection */;

--
-- Final view structure for view `公司信息表`
-- Final view structure for view `占用空间表`
--

/*!50001 DROP VIEW IF EXISTS `公司信息表`*/;
/*!50001 DROP VIEW IF EXISTS `占用空间表`*/;
/*!50001 SET @saved_cs_client = @@character_set_client */;
/*!50001 SET @saved_cs_results = @@character_set_results */;
/*!50001 SET @saved_col_connection = @@collation_connection */;
Expand All @@ -244,16 +244,16 @@ USE `amazon`;
/*!50001 SET collation_connection = utf8mb4_general_ci */;
/*!50001 CREATE ALGORITHM=UNDEFINED */
/*!50013 DEFINER=`amazon`@`%` SQL SECURITY DEFINER */
/*!50001 VIEW `公司信息表` AS select (case `seller`.`info_status` when 0 then '没查找' when 1 then '公司ID' when 2 then '已完整' when 3 then '没有信息' when 4 then '多个信息' end) AS `数量`,count(0) AS `count(*)` from `seller` where (`seller`.`status` = 1) group by `seller`.`info_status` */;
/*!50001 VIEW `占用空间表` AS select `information_schema`.`tables`.`TABLE_SCHEMA` AS `Database`,((sum((`information_schema`.`tables`.`DATA_LENGTH` + `information_schema`.`tables`.`INDEX_LENGTH`)) / 1024) / 1024) AS `Size (MB)` from `information_schema`.`tables` group by `information_schema`.`tables`.`TABLE_SCHEMA` */;
/*!50001 SET character_set_client = @saved_cs_client */;
/*!50001 SET character_set_results = @saved_cs_results */;
/*!50001 SET collation_connection = @saved_col_connection */;

--
-- Final view structure for view `占用空间表`
-- Final view structure for view `商家查找表`
--

/*!50001 DROP VIEW IF EXISTS `占用空间表`*/;
/*!50001 DROP VIEW IF EXISTS `商家查找表`*/;
/*!50001 SET @saved_cs_client = @@character_set_client */;
/*!50001 SET @saved_cs_results = @@character_set_results */;
/*!50001 SET @saved_col_connection = @@collation_connection */;
Expand All @@ -262,7 +262,7 @@ USE `amazon`;
/*!50001 SET collation_connection = utf8mb4_general_ci */;
/*!50001 CREATE ALGORITHM=UNDEFINED */
/*!50013 DEFINER=`amazon`@`%` SQL SECURITY DEFINER */
/*!50001 VIEW `占用空间表` AS select `information_schema`.`tables`.`TABLE_SCHEMA` AS `Database`,((sum((`information_schema`.`tables`.`DATA_LENGTH` + `information_schema`.`tables`.`INDEX_LENGTH`)) / 1024) / 1024) AS `Size (MB)` from `information_schema`.`tables` group by `information_schema`.`tables`.`TABLE_SCHEMA` */;
/*!50001 VIEW `商家查找表` AS select (case `seller`.`status` when 0 then '未查找' when 1 then '中国ID' when 2 then '空ID' when 3 then '其他ID' when 4 then '异常ID' end) AS `数量`,count(0) AS `count(*)` from `seller` group by `seller`.`status` */;
/*!50001 SET character_set_client = @saved_cs_client */;
/*!50001 SET character_set_results = @saved_cs_results */;
/*!50001 SET collation_connection = @saved_col_connection */;
Expand Down Expand Up @@ -330,4 +330,4 @@ USE `amazon`;
/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;

-- Dump completed on 2023-11-21 10:37:17
-- Dump completed on 2024-02-02 19:05:53
6 changes: 3 additions & 3 deletions trn.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ func (trn *trnStruct) main() error {
//
// 找到 91440101MA9Y624U3K
func (trn *trnStruct) request() error {
trn.url = fmt.Sprintf("%s/sp?ie=UTF8&seller=%s", AMAZON_UK, trn.seller_id)
trn.url = fmt.Sprintf("%s/sp?ie=UTF8&seller=%s", app.Domain, trn.seller_id)

log.Infof("查找TRN 链接: %s", trn.url)

Expand All @@ -117,7 +117,7 @@ func (trn *trnStruct) request() error {
if err != nil {
return err
}
req.Header.Set("Authority", `www.amazon.co.uk`)
req.Header.Set("Authority", app.Domain)
req.Header.Set("Accept", `text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7`)
req.Header.Set("Accept-Language", `zh-CN,zh;q=0.9`)
req.Header.Set("cache-control", `max-age=0`)
Expand All @@ -132,7 +132,7 @@ func (trn *trnStruct) request() error {
req.Header.Set("Cookie", app.cookie)
}
req.Header.Set("upgrade-insecure-requests", `1`)
req.Header.Set("Referer", "https://www.amazon.co.uk/s?k=Hardware+electricia%27n&crid=3CR8DCX0B3L5U&sprefix=hardware+electricia%27n%2Caps%2C714&ref=nb_sb_noss")
req.Header.Set("Referer", fmt.Sprintf("https://%s/?k=Hardware+electricia%%27n&crid=3CR8DCX0B3L5U&sprefix=hardware+electricia%%27n%%2Caps%%2C714&ref=nb_sb_noss", app.Domain))
req.Header.Set("Sec-Fetch-Dest", `empty`)
req.Header.Set("Sec-Fetch-Mode", `cors`)
req.Header.Set("Sec-Fetch-Site", `same-origin`)
Expand Down

0 comments on commit 0260e87

Please sign in to comment.