Commit

v1.1.1 Fix an issue where product links could fail to be fetched on different domains
tengfei-xy committed Feb 4, 2024
1 parent 2b62463 commit a41c831
Showing 4 changed files with 72 additions and 34 deletions.
2 changes: 1 addition & 1 deletion README.MD
@@ -92,7 +92,7 @@ mysql> SELECT * FROM amazon.产品检查表;
5 rows in set (0.10 sec)
```

-**问题反馈(微信号):**SXL--LP
+**问题反馈(微信号):** SXL--LP



17 changes: 12 additions & 5 deletions config.yaml.save
@@ -1,10 +1,17 @@
 basic:
-  # app identifier on the same host
+  # Definition: app identifier on the same host
+  # Explanation: the crawler is split into three steps: 1. search for products, 2. search a product for the seller link, 3. search the product link for the TRN number;
+  # app_id therefore divides those steps between programs on the same host, i.e. it allows several crawler clients to run on one host
+  # Purpose: guarantees that the next run can resume the job and will not clash with data handled by other instances
+  # Note: changing this ID only concerns the program, but while running, the app_id is also recorded in the database
+  # It is also recommended to keep this ID globally unique, e.g. if host A starts app_id 1, 2 and 3, host B should start from 4
   app_id: 1
 
-  # identifier between different hosts
-  # host_id should be the same on one host, but if socks5 is used, check whether the cookie itself works with the corresponding proxy server
-  # mainly marks that the same host should use the same cookie; strictly speaking, different cookies would also work
+  # Definition: identifier that distinguishes hosts
+  # Purpose: crawler clients on the same host will use the same cookie
+  # Note: changing this ID concerns the database, so it must match a row in the cookie table
+  # insert statement: INSERT INTO `cookie` (`host_id`,`cookie`) VALUES (your host_id,'xxx=xxx');
+  # update statement: update cookie set cookie='xxx=xxx' where host_id=your host_id;
   host_id: 1
 
   # test mode; the database will not be connected
@@ -18,7 +25,7 @@ proxy:
   # whether to enable the proxy
   enable: true
   socks5:
-    # socks5 is required; currently only one entry is supported
+    # one of these entries is chosen at random for each request
     # to start a socks proxy, you can try installing gost
     # gost -L :8080 or gost -L 127.0.0.1:8080
     - 127.0.0.1:8080
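
The new comment above says one socks5 entry is chosen at random for every request. As a rough sketch of what that can look like with Go's standard net/http — not the project's actual implementation, and the proxy list below is a placeholder — something like:

```go
package main

import (
	"fmt"
	"math/rand"
	"net/http"
	"net/url"
)

// socks5Proxies mirrors the proxy.socks5 list from config.yaml (placeholder values).
var socks5Proxies = []string{
	"127.0.0.1:8080",
	"127.0.0.1:8081",
}

// newClient returns an http.Client that routes its requests through one
// randomly chosen SOCKS5 proxy from the list.
func newClient() *http.Client {
	addr := socks5Proxies[rand.Intn(len(socks5Proxies))]
	return &http.Client{
		Transport: &http.Transport{
			// net/http accepts socks5:// proxy URLs directly (Go 1.9+).
			Proxy: http.ProxyURL(&url.URL{Scheme: "socks5", Host: addr}),
		},
	}
}

func main() {
	resp, err := newClient().Get("https://example.com/")
	if err != nil {
		fmt.Println("request failed:", err)
		return
	}
	defer resp.Body.Close()
	fmt.Println("status:", resp.Status)
}
```

Building a fresh Transport per request is what makes "random proxy per request" possible; a single shared Transport would pin every request to the proxy chosen at construction time.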
75 changes: 53 additions & 22 deletions search.go
@@ -4,6 +4,7 @@ import (
"database/sql"
"fmt"
"net/http"
"net/url"
"strings"

"github.com/PuerkitoBio/goquery"
@@ -209,31 +210,61 @@ func (s *search) get_product_url(doc *goquery.Document) {

 	res := doc.Find("div[class~=s-search-results]").First()

-	res.Find("div[data-index]").Each(func(i int, h *goquery.Selection) {
-		// process the matched div element
-		link, exist := h.Find("a").First().Attr("href")
-		if !exist {
-			return
-		}
-		if strings.HasPrefix(link, "/s") || strings.HasPrefix(link, "/gp/") {
-			return
-		}
-		url := strings.Split(link, "/ref=")
-		_, err := app.db.Exec(`INSERT INTO product(url,param) values(?,?)`, url[0], "/ref="+url[1])
-
-		if is_duplicate_entry(err) {
-			log.Infof("商品已存在 关键词:%s 链接:%s ", s.zh_key, link)
-			return
-		}
-		if err != nil {
-			log.Errorf("商品插入失败 关键词:%s 链接:%s %v ", s.zh_key, link, err)
-			return
-		}
-
-		log.Infof("商品插入成功 关键词:%s 链接:%s ", s.zh_key, link)
-		s.valid += 1
-	})
-}
+	if res.Length() == 0 {
+		log.Errorf("错误的页面结构 关键词:%s", s.zh_key)
+		return
+	}
+	data_index := res.Find("div[data-index]")
+	if data_index.Length() == 0 {
+		log.Errorf("没有找到商品项 关键词:%s", s.zh_key)
+		return
+	}
+	log.Infof("找到商品项数:%d 关键词:%s", data_index.Length(), s.zh_key)
+
+	data_index.Each(func(i int, g *goquery.Selection) {
+		link, exist := g.Find("a").First().Attr("href")
+
+		if exist {
+			if strings.HasPrefix(link, "/s") || strings.HasPrefix(link, "/gp/") || strings.Contains(link, `javascript:void(0)`) {
+				link = fmt.Sprintf("https://%s%s", app.Domain, link)
+				log.Errorf("不是预设的商品链接,可能需要验证cookie 关键词:%s 具体链接:%s", s.zh_key, link)
+			} else if strings.Contains(link, `%2Fdp%2F`) {
+				// decode the URL-encoded link
+				link, _ = url.QueryUnescape(link)
+				// keep everything from /dp/ onward
+				link = "/dp/" + strings.Split(link, "/dp/")[1]
+			}
+			if strings.Contains(link, `/dp/`) {
+				link = "/dp/" + strings.Split(link, "/dp/")[1]
+			}
+			s.deal_prouct_url(link)
+
+		} else {
+			link = fmt.Sprintf("https://%s%s", app.Domain, link)
+			log.Errorf("此商品项中未找到链接 关键词:%s 商品链接:%s 页面商品序号:%d", s.zh_key, link, i)
+		}
+	})
+}
+func (s *search) deal_prouct_url(link string) {
+	url := strings.Split(link, "/ref=")
+	_, err := app.db.Exec(`INSERT INTO product(url,param) values(?,?)`, url[0], "/ref="+url[1])
+
+	link = fmt.Sprintf("https://%s%s", app.Domain, link)
+	if is_duplicate_entry(err) {
+		log.Infof("商品已存在 关键词:%s 链接:%s ", s.zh_key, link)
+		return
+	}
+	if err != nil {
+		log.Errorf("商品插入失败 关键词:%s 链接:%s %v ", s.zh_key, link, err)
+		return
+	}
+
+	log.Infof("商品插入成功 关键词:%s 链接:%s ", s.zh_key, link)
+	s.valid += 1
+}
12 changes: 6 additions & 6 deletions sql/ddl.sql
@@ -115,7 +115,7 @@ CREATE TABLE `application` (
`status` tinyint(1) NOT NULL DEFAULT '0',
`update` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
PRIMARY KEY (`id`)
-) ENGINE=InnoDB AUTO_INCREMENT=125 DEFAULT CHARSET=utf8mb4;
+) ENGINE=InnoDB AUTO_INCREMENT=143 DEFAULT CHARSET=utf8mb4;
/*!40101 SET character_set_client = @saved_cs_client */;

--
@@ -131,7 +131,7 @@ CREATE TABLE `category` (
`en_key` varchar(50) NOT NULL,
`priority` int(11) DEFAULT '0',
PRIMARY KEY (`id`)
-) ENGINE=InnoDB AUTO_INCREMENT=1015 DEFAULT CHARSET=utf8;
+) ENGINE=InnoDB AUTO_INCREMENT=1016 DEFAULT CHARSET=utf8;
/*!40101 SET character_set_client = @saved_cs_client */;

--
@@ -158,12 +158,12 @@ DROP TABLE IF EXISTS `product`;
CREATE TABLE `product` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`url` varchar(200) NOT NULL,
-  `param` varchar(150) NOT NULL,
+  `param` varchar(400) NOT NULL,
`status` tinyint(1) DEFAULT '0',
`app` tinyint(1) NOT NULL DEFAULT '0',
PRIMARY KEY (`id`),
UNIQUE KEY `url` (`url`)
-) ENGINE=InnoDB AUTO_INCREMENT=461455 DEFAULT CHARSET=utf8;
+) ENGINE=InnoDB AUTO_INCREMENT=461937 DEFAULT CHARSET=utf8;
/*!40101 SET character_set_client = @saved_cs_client */;
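
For a database created from an earlier revision of ddl.sql, the wider column presumably has to be applied by hand, e.g. with something along the lines of `ALTER TABLE product MODIFY param varchar(400) NOT NULL;` — the dump above only records the new definition.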

--
@@ -184,7 +184,7 @@ CREATE TABLE `search_statistics` (
PRIMARY KEY (`id`),
KEY `category_id` (`category_id`),
CONSTRAINT `search_statistics_ibfk_1` FOREIGN KEY (`category_id`) REFERENCES `category` (`id`)
-) ENGINE=InnoDB AUTO_INCREMENT=2307 DEFAULT CHARSET=utf8;
+) ENGINE=InnoDB AUTO_INCREMENT=2330 DEFAULT CHARSET=utf8;
/*!40101 SET character_set_client = @saved_cs_client */;

--
@@ -330,4 +330,4 @@ USE `amazon`;
/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;

--- Dump completed on 2024-02-02 19:05:53
+-- Dump completed on 2024-02-04 12:07:28
