From bb939905a2eb8091da816a76b6022ed15249dfda Mon Sep 17 00:00:00 2001 From: jiangwei1995910 Date: Tue, 30 Jul 2019 22:00:17 +0800 Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E9=93=BE=E5=AE=B6=E7=A7=9F?= =?UTF-8?q?=E6=88=BF=E6=95=B0=E6=8D=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Query.js | 97 ++++--- README.md | 107 +++----- build.sh | 17 +- config.yaml.all => config-all.yaml | 287 +++++++++++++++++++- config.yaml | 263 +++++++++++++++++- configs/config.go | 51 ++-- db/client.go | 28 +- db/save.go | 1 - clean_status.go => entrance/clean_status.go | 20 +- entrance/info.go | 82 ++++++ lianjia.go => entrance/lianjia.go | 15 +- entrance/lianjia_zufang.go | 185 +++++++++++++ zhilian.go => entrance/zhilian.go | 6 +- main.go | 54 ++++ numLog.txt | 11 + proxypool/proxy.go | 2 +- 16 files changed, 1040 insertions(+), 186 deletions(-) rename config.yaml.all => config-all.yaml (88%) rename clean_status.go => entrance/clean_status.go (75%) create mode 100644 entrance/info.go rename lianjia.go => entrance/lianjia.go (97%) create mode 100644 entrance/lianjia_zufang.go rename zhilian.go => entrance/zhilian.go (98%) create mode 100644 main.go create mode 100644 numLog.txt diff --git a/Query.js b/Query.js index ed710af..48c0e5b 100644 --- a/Query.js +++ b/Query.js @@ -1,52 +1,63 @@ +// 这里是一些常用的查询语句 + + // 房价均价查询语句 db.lianjia.aggregate([ - {'$match': {"address.0": {$exists: true}}}, - { - $group: { - _id: {"$arrayElemAt": ["$address", 0]}, - count: {$sum: 1}, - avg_UnitPrice: {$avg: "$UnitPrice"}, - std: {$stdDevPop: "$UnitPrice"}, - } - }, - { - $project: - { - count: 1, //总数 - avg_UnitPrice: 1, //每平米均价 - std: 1, //标准差 - ratio: {$divide: ["$std", "$avg_UnitPrice"]} //标准差与均价的比值 - } - }, - { - '$sort': {count: -1} - } + {'$match': {"address.0": {$exists: true}}}, + { + $group: { + _id: {"$arrayElemAt": ["$address", 0]}, + count: {$sum: 1}, + avg_UnitPrice: {$avg: "$UnitPrice"}, + std: {$stdDevPop: "$UnitPrice"}, + } + }, + { + $project: + { + count: 1, //总数 + avg_UnitPrice: 1, //每平米均价 + std: 1, //标准差 + ratio: {$divide: ["$std", "$avg_UnitPrice"]} //标准差与均价的比值 + } + }, + { + '$sort': {count: -1} + } ]); // 平均薪资查询语句 db.zhilian.aggregate([ - {'$match': {"workingExp.name": "1-3年"}}, - { - $group: { - _id: {"$arrayElemAt": ["$city.items", 0]}, - count: {$sum: 1}, - avg: {$avg: "$avg"}, - std: {$stdDevPop: "$avg"}, - } - }, - { - $project: - { - count: 1, //总数 - avg: 1, //平均薪资 - std: 1, //标准差 - ratio: {$divide: ["$std", "$avg"]} //标准差与均价的比值 - } - }, - { - '$sort': {count: -1} - } -]); \ No newline at end of file + {'$match': {"workingExp.name": "1-3年"}}, + { + $group: { + _id: {"$arrayElemAt": ["$city.items", 0]}, + count: {$sum: 1}, + avg: {$avg: "$avg"}, + std: {$stdDevPop: "$avg"}, + } + }, + { + $project: + { + count: 1, //总数 + avg: 1, //平均薪资 + std: 1, //标准差 + ratio: {$divide: ["$std", "$avg"]} //标准差与均价的比值 + } + }, + { + '$sort': {count: -1} + } +]); + + +// 当前进度 +db.lianjia.aggregate([ + { + '$sort': {detailCrawlTime: -1} + } +], {allowDiskUse: true}); \ No newline at end of file diff --git a/README.md b/README.md index 3e5a212..78b0d3f 100644 --- a/README.md +++ b/README.md @@ -7,94 +7,71 @@ > **注意!**\ > 1.本项目仅供学习研究,禁止用于任何商业项目\ -> 2.运行的时候为被爬方考虑下!尽量不要爬全站。请在配置文件中设置你需要的城市爬取即可! +> 2.运行的时候为被爬方考虑下!尽量不要爬全站。请在配置文件中设置你需要的城市爬取即可!\ +> 3.[项目主页](https://jiangwei1995910.github.io/getAwayBSG/)里面有现成数据,不需要你自己动手运行爬虫 - -## What! +## 啥? 如果你是一个正准备逃离北上广等一线城市却又找不到去处的IT人士,或许这个项目能给你点建议。 -## Desc - -或许你跟我一样困惑,为此我通过爬虫抓取了智联招聘跟链家这2个平台的全部数据。最终拿到了18W+全国各个城市的招聘数据与81W+全国各地的房屋成交记录数据。 - -其中,招聘数据我抓取了工作年限,公司名称,公司规模,公司类型,工作类型,创建时间,工作名称,结束时间,教育情况,薪资字段。 - -职位我使用了['php', 'java', 'python', 'c/c++', 'c#', 'mysql', 'oracle', 'javascript', 'linux', 'SQL', '软件', '程序员']作为关键词搜索。基本上涵盖了程序猿们绝大部分工作 - -对应房屋成交记录,我抓取了成交时间,成交价格,每平米均价,地址字段。 - -## 分析 - -### 综合分析 - -首先,房价和薪资都没法代表一个地方的生活成本情况。因此我使用了(月薪/每平米房价)的倒数值来表示一个城市的生活成本。这个值越大,表明这个地方的生活成本越高。结果如下图 -![](./docs/img/shcb.png) -结果很明显了,如果你想去一个安稳生活的地方,这个表中前几的城市都不错,买房压力较低,并且我相信经常逛Github的程序猿肯定都是平均薪资几倍的收入。反之,如果你想挑战人生的地狱模式,emmmmm - - -另外,也附上各个城市的月薪,每平米房价对比情况 -![](./docs/img/fjxz.png) - -### 工作机会 - -统计了各个规模公司的招聘数量 -![](./docs/img/gzjh.png) - - - -### 薪资 - -首先,我计算了各个城市的平均薪资情况,为什么不包含博士学历呢?因为智联上面写明要博士的职位很少(硕士其实也不多,只有几百的量),抓下来每个城市都是几十的量,这种数据不具有统计意义如下: - -![](./docs/img/avg.png) - -可见,硕士的薪资跟本科比并没多大差距。另外,别小看了拉萨,拉萨的薪资并不低,但是方差特别大,说明如果你愿意去拉萨,其实你的薪资会很高,平均薪资低是因为有很多1000 2000的工作拉下去的 - -其他都是意料之中的了 - - -其实,平均值并没有多大参考意义,因为被平均的东西太多了,比如工龄就是很重要一个,于是我又取了各工龄的情况,如下 - -![](./docs/img/workTime.png) - -### 房价 - -对于一个地方是否合适自己发展,房价非常重要,于是我也分析了各个城市的房价数据。 - -首先,也先来个平均 -![](./docs/img/avgRoom.png) +通过爬虫,抓取了链接、智联的工作,租房,二手房一系列数据,为你提供各城市的宏观分析数据 +## 安装 -其次,也来个近10年房价的走势图吧 -![](./docs/img/room.png) -似乎有个大致规律,上半年涨,下半年跌。 +从[releases](https://github.com/jiangwei1995910/getAwayBSG/releases)下载对应操作系统,对应平台的二进制文件和配置文件模板 +## 配置 -## How to run +打开配置文件你就知道了 -**这应该是你在Github上能找到的运行最简单的爬虫项目!** 如果你想要运行这个项目,在[releases](https://github.com/jiangwei1995910/getAwayBSG/releases)里面下载你需要的操作系统平台,修改配置文件,双击运行,搞定! +## 运行 +链家二手房数据抓取 -## 未完成 TODO +``` +getAwayBSG -config=config.yaml -lianjia_ershou +``` -1.Go语言重构后目前只写了链家和智联招聘的爬虫,自如的还没写 +链家租房数据抓取 -2.有空再加一些其他网站的数据进来 +``` +getAwayBSG -config=config.yaml -lianjia_zufang +``` -3.加入租房数据,计算各个城市的租售比 +智联招聘数据抓取 +``` +getAwayBSG -config=config.yaml -zhilian +``` +其他命令 +1.clean -## 2019-07-03 Update Log +清除缓存状态,抓取过程中会将抓取进度保存到mongodb,每次启动会从上次位置继续抓取,如果你需要清除缓存状态,执行 +``` +getAwayBSG -clean +``` +该命令支持脚本调用 +``` +getAwayBSG -clean [option] +``` -为了能够在树莓派(没错,我就是拿树莓派跑爬虫服务的)上面有更好的运行效率,使用了Golang语言重构了整个项目,数据库换为MongoDB +option支持:lianjia_ershou、zhilian、lianjia_zufang -爬取数据源修改:链家改为爬取二手房数据,而不是以前的二手房交易记录 +2.info +方便定时脚本记录抓取情况,使用info命令可以输出当前抓取数据量到文件 +``` +getAwayBSG -info +``` +3.help +输出支持的全部命令列表 +``` +getAwayBSG -help +``` \ No newline at end of file diff --git a/build.sh b/build.sh index 01a393e..9577fae 100644 --- a/build.sh +++ b/build.sh @@ -1,15 +1,2 @@ -CGO_ENABLED=0 GOOS=darwin GOARCH=amd64 go build -o bin/macos64/lianjia lianjia.go -CGO_ENABLED=0 GOOS=darwin GOARCH=amd64 go build -o bin/macos64/zhilian zhilian.go -CGO_ENABLED=0 GOOS=darwin GOARCH=amd64 go build -o bin/macos64/clean_status clean_status.go -cp ./config.yaml ./bin/macos64/config.yaml - -CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o bin/linux64/lianjia lianjia.go -CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o bin/linux64/zhilian zhilian.go -CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o bin/linux64/clean_status clean_status.go -cp ./config.yaml ./bin/linux64/config.yaml - - -CGO_ENABLED=0 GOOS=windows GOARCH=amd64 go build -o bin/windows64/lianjia.exe lianjia.go -CGO_ENABLED=0 GOOS=windows GOARCH=amd64 go build -o bin/windows64/zhilian.exe zhilian.go -CGO_ENABLED=0 GOOS=windows GOARCH=amd64 go build -o bin/windows64/clean_status.exe clean_status.go -cp ./config.yaml ./bin/windows64/config.yaml +xgo --targets=windows/*,darwin/*,linux/* -out ./bin/getAwayBSG ./ +cp ./config.yaml ./bin/config.yaml \ No newline at end of file diff --git a/config.yaml.all b/config-all.yaml similarity index 88% rename from config.yaml.all rename to config-all.yaml index 6d96394..3ad92f4 100644 --- a/config.yaml.all +++ b/config-all.yaml @@ -7,6 +7,8 @@ dbDatabase: pachong # 抓取状态记录,这个库不可改名,如果启用了权限管理需要添加这个库的权限 collyDatabase: colly +# 租房存储数据集合 +zufangCollection: lianjia_zufang # 抓取间隔时间 单位秒 crawlDelay: 3 @@ -20,8 +22,6 @@ dbCollection: lianjia # 自己添加positionURL 字段的唯一键,不然有重复 zlDBCollection: zhilian - - # 代理列表 # 注释掉则不使用代理(不使用代理偶尔被封,偶尔没事,抓慢点好像就没问题,我树莓派上面没问题,电脑上可能是太快了,会被封) #proxyList: @@ -30,7 +30,7 @@ zlDBCollection: zhilian - +# 智联招聘平台爬取关键词 zlKeyWords: - php - java @@ -45,14 +45,17 @@ zlKeyWords: - 软件开发 - 程序员 + +# !!!请尽量为被爬方考虑,不要抓全站,配置你需要抓取的城市即可! # 需要抓取房价城市 cityList: + - https://cd.lianjia.com/ershoufang/ #成都 + - https://bj.lianjia.com/ershoufang/ #北京 - https://aq.lianjia.com/ershoufang/ #安庆 - https://cz.fang.lianjia.com/ershoufang/ #滁州 - https://hf.lianjia.com/ershoufang/ #合肥 - https://mas.lianjia.com/ershoufang/ #马鞍山 - https://wuhu.lianjia.com/ershoufang/ #芜湖 - - https://bj.lianjia.com/ershoufang/ #北京 - https://cq.lianjia.com/ershoufang/ #重庆 - https://fz.lianjia.com/ershoufang/ #福州 - https://ly.fang.lianjia.com/ershoufang/ #龙岩 @@ -138,7 +141,6 @@ cityList: - https://hhht.lianjia.com/ershoufang/ #呼和浩特 - https://yinchuan.lianjia.com/ershoufang/ #银川 - https://sh.lianjia.com/ershoufang/ #上海 - - https://cd.lianjia.com/ershoufang/ #成都 - https://dy.fang.lianjia.com/ershoufang/ #德阳 - https://dazhou.lianjia.com/ershoufang/ #达州 - https://leshan.fang.lianjia.com/ershoufang/ #乐山 @@ -174,8 +176,270 @@ cityList: +# 需要抓取租房数据的城市 +zufangCityList: + - link: https://cd.lianjia.com/zufang/ + name: 成都 + - link: https://bj.lianjia.com/zufang/ + name: 北京 + - link: https://aq.lianjia.com/zufang/ + name: 安庆 + - link: https://cz.fang.lianjia.com/zufang/ + name: 滁州 + - link: https://hf.lianjia.com/zufang/ + name: 合肥 + - link: https://mas.lianjia.com/zufang/ + name: 马鞍山 + - link: https://wuhu.lianjia.com/zufang/ + name: 芜湖 + - link: https://cq.lianjia.com/zufang/ + name: 重庆 + - link: https://fz.lianjia.com/zufang/ + name: 福州 + - link: https://ly.fang.lianjia.com/zufang/ + name: 龙岩 + - link: https://quanzhou.lianjia.com/zufang/ + name: 泉州 + - link: https://xm.lianjia.com/zufang/ + name: 厦门 + - link: https://zhangzhou.lianjia.com/zufang/ + name: 漳州 + - link: https://dg.lianjia.com/zufang/ + name: 东莞 + - link: https://fs.lianjia.com/zufang/ + name: 佛山 + - link: https://gz.lianjia.com/zufang/ + name: 广州 + - link: https://hui.lianjia.com/zufang/ + name: 惠州 + - link: https://jiangmen.lianjia.com/zufang/ + name: 江门 + - link: https://qy.lianjia.com/zufang/ + name: 清远 + - link: https://sz.lianjia.com/zufang/ + name: 深圳 + - link: https://zh.lianjia.com/zufang/ + name: 珠海 + - link: https://zhanjiang.lianjia.com/zufang/ + name: 湛江 + - link: https://zs.lianjia.com/zufang/ + name: 中山 + - link: https://gy.lianjia.com/zufang/ + name: 贵阳 + - link: https://bh.lianjia.com/zufang/ + name: 北海 + - link: https://gl.lianjia.com/zufang/ + name: 桂林 + - link: https://liuzhou.lianjia.com/zufang/ + name: 柳州 + - link: https://nn.lianjia.com/zufang/ + name: 南宁 + - link: https://lz.lianjia.com/zufang/ + name: 兰州 + - link: https://huangshi.lianjia.com/zufang/ + name: 黄石 + - link: https://hg.fang.lianjia.com/zufang/ + name: 黄冈 + - link: https://wh.lianjia.com/zufang/ + name: 武汉 + - link: https://xy.lianjia.com/zufang/ + name: 襄阳 + - link: https://xn.fang.lianjia.com/zufang/ + name: 咸宁 + - link: https://yichang.lianjia.com/zufang/ + name: 宜昌 + - link: https://cs.lianjia.com/zufang/ + name: 长沙 + - link: https://changde.lianjia.com/zufang/ + name: 常德 + - link: https://yy.lianjia.com/zufang/ + name: 岳阳 + - link: https://zhuzhou.lianjia.com/zufang/ + name: 株洲 + - link: https://bd.lianjia.com/zufang/ + name: 保定 + - link: https://chengde.fang.lianjia.com/zufang/ + name: 承德 + - link: https://hd.fang.lianjia.com/zufang/ + name: 邯郸 + - link: https://hs.fang.lianjia.com/zufang/ + name: 衡水 + - link: https://lf.lianjia.com/zufang/ + name: 廊坊 + - link: https://qhd.fang.lianjia.com/zufang/ + name: 秦皇岛 + - link: https://sjz.lianjia.com/zufang/ + name: 石家庄 + - link: https://ts.lianjia.com/zufang/ + name: 唐山 + - link: https://xt.fang.lianjia.com/zufang/ + name: 邢台 + - link: https://zjk.lianjia.com/zufang/ + name: 张家口 + - link: https://bt.fang.lianjia.com/zufang/ + name: 保亭 + - link: https://cm.fang.lianjia.com/zufang/ + name: 澄迈 + - link: https://dz.fang.lianjia.com/zufang/ + name: 儋州 + - link: https://da.fang.lianjia.com/zufang/ + name: 定安 + - link: https://hk.lianjia.com/zufang/ + name: 海口 + - link: https://lg.fang.lianjia.com/zufang/ + name: 临高 + - link: https://ld.fang.lianjia.com/zufang/ + name: 乐东 + - link: https://ls.fang.lianjia.com/zufang/ + name: 陵水 + - link: https://qh.fang.lianjia.com/zufang/ + name: 琼海 + - link: https://qz.fang.lianjia.com/zufang/ + name: 琼中 + - link: https://san.lianjia.com/zufang/ + name: 三亚 + - link: https://wzs.fang.lianjia.com/zufang/ + name: 五指山 + - link: https://wc.fang.lianjia.com/zufang/ + name: 文昌 + - link: https://wn.fang.lianjia.com/zufang/ + name: 万宁 + - link: https://kf.lianjia.com/zufang/ + name: 开封 + - link: https://luoyang.lianjia.com/zufang/ + name: 洛阳 + - link: https://xinxiang.lianjia.com/zufang/ + name: 新乡 + - link: https://xc.lianjia.com/zufang/ + name: 许昌 + - link: https://zz.lianjia.com/zufang/ + name: 郑州 + - link: https://hrb.lianjia.com/zufang/ + name: 哈尔滨 + - link: https://changzhou.lianjia.com/zufang/ + name: 常州 + - link: https://ha.lianjia.com/zufang/ + name: 淮安 + - link: https://ks.lianjia.com/zufang/ + name: 昆山 + - link: https://nj.lianjia.com/zufang/ + name: 南京 + - link: https://nt.lianjia.com/zufang/ + name: 南通 + - link: https://su.lianjia.com/zufang/ + name: 苏州 + - link: https://wx.lianjia.com/zufang/ + name: 无锡 + - link: https://xz.lianjia.com/zufang/ + name: 徐州 + - link: https://yc.lianjia.com/zufang/ + name: 盐城 + - link: https://zj.lianjia.com/zufang/ + name: 镇江 + - link: https://cc.lianjia.com/zufang/ + name: 长春 + - link: https://jl.lianjia.com/zufang/ + name: 吉林 + - link: https://ganzhou.lianjia.com/zufang/ + name: 赣州 + - link: https://jiujiang.lianjia.com/zufang/ + name: 九江 + - link: https://jian.lianjia.com/zufang/ + name: 吉安 + - link: https://nc.lianjia.com/zufang/ + name: 南昌 + - link: https://sr.lianjia.com/zufang/ + name: 上饶 + - link: https://dl.lianjia.com/zufang/ + name: 大连 + - link: https://dd.lianjia.com/zufang/ + name: 丹东 + - link: https://sy.lianjia.com/zufang/ + name: 沈阳 + - link: https://hhht.lianjia.com/zufang/ + name: 呼和浩特 + - link: https://yinchuan.lianjia.com/zufang/ + name: 银川 + - link: https://sh.lianjia.com/zufang/ + name: 上海 + - link: https://dy.fang.lianjia.com/zufang/ + name: 德阳 + - link: https://dazhou.lianjia.com/zufang/ + name: 达州 + - link: https://leshan.fang.lianjia.com/zufang/ + name: 乐山 + - link: https://liangshan.lianjia.com/zufang/ + name: 凉山 + - link: https://mianyang.lianjia.com/zufang/ + name: 绵阳 + - link: https://ms.fang.lianjia.com/zufang/ + name: 眉山 + - link: https://nanchong.lianjia.com/zufang/ + name: 南充 + - link: https://jn.lianjia.com/zufang/ + name: 济南 + - link: https://linyi.lianjia.com/zufang/ + name: 临沂 + - link: https://qd.lianjia.com/zufang/ + name: 青岛 + - link: https://wf.lianjia.com/zufang/ + name: 潍坊 + - link: https://weihai.lianjia.com/zufang/ + name: 威海 + - link: https://yt.lianjia.com/zufang/ + name: 烟台 + - link: https://zb.lianjia.com/zufang/ + name: 淄博 + - link: https://baoji.lianjia.com/zufang/ + name: 宝鸡 + - link: https://hanzhong.lianjia.com/zufang/ + name: 汉中 + - link: https://xa.lianjia.com/zufang/ + name: 西安 + - link: https://xianyang.lianjia.com/zufang/ + name: 咸阳 + - link: https://jz.fang.lianjia.com/zufang/ + name: 晋中 + - link: https://ty.lianjia.com/zufang/ + name: 太原 + - link: https://tj.lianjia.com/zufang/ + name: 天津 + - link: https://dali.fang.lianjia.com//zufang/ + name: 大理 + - link: https://km.lianjia.com/zufang/ + name: 昆明 + - link: https://xsbn.fang.lianjia.com/zufang/ + name: 西双版纳 + - link: https://hz.lianjia.com/zufang/ + name: 杭州 + - link: https://huzhou.lianjia.com/zufang/ + name: 湖州 + - link: https://jx.lianjia.com/zufang/ + name: 嘉兴 + - link: https://jh.lianjia.com/zufang/ + name: 金华 + - link: https://nb.lianjia.com/zufang/ + name: 宁波 + - link: https://sx.lianjia.com/zufang/ + name: 绍兴 + - link: https://taizhou.lianjia.com/zufang/ + name: 台州 + - link: https://wz.lianjia.com/zufang/ + name: 温州 + + + + + + +# !!!请尽量为被爬方考虑,不要抓全站,配置你需要抓取的城市即可! # 需要抓取招聘信息的城市 zlCityList: + - name: 北京 + url: https://www.zhaopin.com/beijing/ + code: 530 + pinyin: beijing + priority: 1 - name: 鞍山 url: https://www.zhaopin.com/anshan/ code: 601 @@ -260,11 +524,6 @@ zlCityList: url: https://sou.zhaopin.com/Jobs/searchresult.ashx?jl=482&sm=0&p=1&sf=0 code: 482 pinyin: aodaliya - - name: 北京 - url: https://www.zhaopin.com/beijing/ - code: 530 - pinyin: beijing - priority: 1 - name: 包头 url: https://www.zhaopin.com/baotou/ code: 588 @@ -1256,6 +1515,10 @@ zlCityList: url: https://www.zhaopin.com/maoming/ code: 771 pinyin: maoming + - name: 蒙自市 + url: https://www.zhaopin.com/mengzishi/ + code: + pinyin: mengzishi - name: 满洲里 url: https://www.zhaopin.com/manzhouli/ code: 10157 @@ -2302,6 +2565,4 @@ zlCityList: - name: 遵化 url: https://www.zhaopin.com/zunhua/ code: 10143 - pinyin: zunhua - - + pinyin: zunhua \ No newline at end of file diff --git a/config.yaml b/config.yaml index 0dd6ac1..ff69ce1 100644 --- a/config.yaml +++ b/config.yaml @@ -7,6 +7,8 @@ dbDatabase: pachong # 抓取状态记录,这个库不可改名,如果启用了权限管理需要添加这个库的权限 collyDatabase: colly +# 租房存储数据集合 +zufangCollection: lianjia_zufang # 抓取间隔时间 单位秒 crawlDelay: 3 @@ -173,6 +175,263 @@ cityList: # - https://wz.lianjia.com/ershoufang/ #温州 + +# 需要抓取租房数据的城市 +zufangCityList: + - link: https://cd.lianjia.com/zufang/ + name: 成都 + - link: https://bj.lianjia.com/zufang/ + name: 北京 +# - link: https://aq.lianjia.com/zufang/ +# name: 安庆 +# - link: https://cz.fang.lianjia.com/zufang/ +# name: 滁州 +# - link: https://hf.lianjia.com/zufang/ +# name: 合肥 +# - link: https://mas.lianjia.com/zufang/ +# name: 马鞍山 +# - link: https://wuhu.lianjia.com/zufang/ +# name: 芜湖 +# - link: https://cq.lianjia.com/zufang/ +# name: 重庆 +# - link: https://fz.lianjia.com/zufang/ +# name: 福州 +# - link: https://ly.fang.lianjia.com/zufang/ +# name: 龙岩 +# - link: https://quanzhou.lianjia.com/zufang/ +# name: 泉州 +# - link: https://xm.lianjia.com/zufang/ +# name: 厦门 +# - link: https://zhangzhou.lianjia.com/zufang/ +# name: 漳州 +# - link: https://dg.lianjia.com/zufang/ +# name: 东莞 +# - link: https://fs.lianjia.com/zufang/ +# name: 佛山 +# - link: https://gz.lianjia.com/zufang/ +# name: 广州 +# - link: https://hui.lianjia.com/zufang/ +# name: 惠州 +# - link: https://jiangmen.lianjia.com/zufang/ +# name: 江门 +# - link: https://qy.lianjia.com/zufang/ +# name: 清远 +# - link: https://sz.lianjia.com/zufang/ +# name: 深圳 +# - link: https://zh.lianjia.com/zufang/ +# name: 珠海 +# - link: https://zhanjiang.lianjia.com/zufang/ +# name: 湛江 +# - link: https://zs.lianjia.com/zufang/ +# name: 中山 +# - link: https://gy.lianjia.com/zufang/ +# name: 贵阳 +# - link: https://bh.lianjia.com/zufang/ +# name: 北海 +# - link: https://gl.lianjia.com/zufang/ +# name: 桂林 +# - link: https://liuzhou.lianjia.com/zufang/ +# name: 柳州 +# - link: https://nn.lianjia.com/zufang/ +# name: 南宁 +# - link: https://lz.lianjia.com/zufang/ +# name: 兰州 +# - link: https://huangshi.lianjia.com/zufang/ +# name: 黄石 +# - link: https://hg.fang.lianjia.com/zufang/ +# name: 黄冈 +# - link: https://wh.lianjia.com/zufang/ +# name: 武汉 +# - link: https://xy.lianjia.com/zufang/ +# name: 襄阳 +# - link: https://xn.fang.lianjia.com/zufang/ +# name: 咸宁 +# - link: https://yichang.lianjia.com/zufang/ +# name: 宜昌 +# - link: https://cs.lianjia.com/zufang/ +# name: 长沙 +# - link: https://changde.lianjia.com/zufang/ +# name: 常德 +# - link: https://yy.lianjia.com/zufang/ +# name: 岳阳 +# - link: https://zhuzhou.lianjia.com/zufang/ +# name: 株洲 +# - link: https://bd.lianjia.com/zufang/ +# name: 保定 +# - link: https://chengde.fang.lianjia.com/zufang/ +# name: 承德 +# - link: https://hd.fang.lianjia.com/zufang/ +# name: 邯郸 +# - link: https://hs.fang.lianjia.com/zufang/ +# name: 衡水 +# - link: https://lf.lianjia.com/zufang/ +# name: 廊坊 +# - link: https://qhd.fang.lianjia.com/zufang/ +# name: 秦皇岛 +# - link: https://sjz.lianjia.com/zufang/ +# name: 石家庄 +# - link: https://ts.lianjia.com/zufang/ +# name: 唐山 +# - link: https://xt.fang.lianjia.com/zufang/ +# name: 邢台 +# - link: https://zjk.lianjia.com/zufang/ +# name: 张家口 +# - link: https://bt.fang.lianjia.com/zufang/ +# name: 保亭 +# - link: https://cm.fang.lianjia.com/zufang/ +# name: 澄迈 +# - link: https://dz.fang.lianjia.com/zufang/ +# name: 儋州 +# - link: https://da.fang.lianjia.com/zufang/ +# name: 定安 +# - link: https://hk.lianjia.com/zufang/ +# name: 海口 +# - link: https://lg.fang.lianjia.com/zufang/ +# name: 临高 +# - link: https://ld.fang.lianjia.com/zufang/ +# name: 乐东 +# - link: https://ls.fang.lianjia.com/zufang/ +# name: 陵水 +# - link: https://qh.fang.lianjia.com/zufang/ +# name: 琼海 +# - link: https://qz.fang.lianjia.com/zufang/ +# name: 琼中 +# - link: https://san.lianjia.com/zufang/ +# name: 三亚 +# - link: https://wzs.fang.lianjia.com/zufang/ +# name: 五指山 +# - link: https://wc.fang.lianjia.com/zufang/ +# name: 文昌 +# - link: https://wn.fang.lianjia.com/zufang/ +# name: 万宁 +# - link: https://kf.lianjia.com/zufang/ +# name: 开封 +# - link: https://luoyang.lianjia.com/zufang/ +# name: 洛阳 +# - link: https://xinxiang.lianjia.com/zufang/ +# name: 新乡 +# - link: https://xc.lianjia.com/zufang/ +# name: 许昌 +# - link: https://zz.lianjia.com/zufang/ +# name: 郑州 +# - link: https://hrb.lianjia.com/zufang/ +# name: 哈尔滨 +# - link: https://changzhou.lianjia.com/zufang/ +# name: 常州 +# - link: https://ha.lianjia.com/zufang/ +# name: 淮安 +# - link: https://ks.lianjia.com/zufang/ +# name: 昆山 +# - link: https://nj.lianjia.com/zufang/ +# name: 南京 +# - link: https://nt.lianjia.com/zufang/ +# name: 南通 +# - link: https://su.lianjia.com/zufang/ +# name: 苏州 +# - link: https://wx.lianjia.com/zufang/ +# name: 无锡 +# - link: https://xz.lianjia.com/zufang/ +# name: 徐州 +# - link: https://yc.lianjia.com/zufang/ +# name: 盐城 +# - link: https://zj.lianjia.com/zufang/ +# name: 镇江 +# - link: https://cc.lianjia.com/zufang/ +# name: 长春 +# - link: https://jl.lianjia.com/zufang/ +# name: 吉林 +# - link: https://ganzhou.lianjia.com/zufang/ +# name: 赣州 +# - link: https://jiujiang.lianjia.com/zufang/ +# name: 九江 +# - link: https://jian.lianjia.com/zufang/ +# name: 吉安 +# - link: https://nc.lianjia.com/zufang/ +# name: 南昌 +# - link: https://sr.lianjia.com/zufang/ +# name: 上饶 +# - link: https://dl.lianjia.com/zufang/ +# name: 大连 +# - link: https://dd.lianjia.com/zufang/ +# name: 丹东 +# - link: https://sy.lianjia.com/zufang/ +# name: 沈阳 +# - link: https://hhht.lianjia.com/zufang/ +# name: 呼和浩特 +# - link: https://yinchuan.lianjia.com/zufang/ +# name: 银川 +# - link: https://sh.lianjia.com/zufang/ +# name: 上海 +# - link: https://dy.fang.lianjia.com/zufang/ +# name: 德阳 +# - link: https://dazhou.lianjia.com/zufang/ +# name: 达州 +# - link: https://leshan.fang.lianjia.com/zufang/ +# name: 乐山 +# - link: https://liangshan.lianjia.com/zufang/ +# name: 凉山 +# - link: https://mianyang.lianjia.com/zufang/ +# name: 绵阳 +# - link: https://ms.fang.lianjia.com/zufang/ +# name: 眉山 +# - link: https://nanchong.lianjia.com/zufang/ +# name: 南充 +# - link: https://jn.lianjia.com/zufang/ +# name: 济南 +# - link: https://linyi.lianjia.com/zufang/ +# name: 临沂 +# - link: https://qd.lianjia.com/zufang/ +# name: 青岛 +# - link: https://wf.lianjia.com/zufang/ +# name: 潍坊 +# - link: https://weihai.lianjia.com/zufang/ +# name: 威海 +# - link: https://yt.lianjia.com/zufang/ +# name: 烟台 +# - link: https://zb.lianjia.com/zufang/ +# name: 淄博 +# - link: https://baoji.lianjia.com/zufang/ +# name: 宝鸡 +# - link: https://hanzhong.lianjia.com/zufang/ +# name: 汉中 +# - link: https://xa.lianjia.com/zufang/ +# name: 西安 +# - link: https://xianyang.lianjia.com/zufang/ +# name: 咸阳 +# - link: https://jz.fang.lianjia.com/zufang/ +# name: 晋中 +# - link: https://ty.lianjia.com/zufang/ +# name: 太原 +# - link: https://tj.lianjia.com/zufang/ +# name: 天津 +# - link: https://dali.fang.lianjia.com//zufang/ +# name: 大理 +# - link: https://km.lianjia.com/zufang/ +# name: 昆明 +# - link: https://xsbn.fang.lianjia.com/zufang/ +# name: 西双版纳 +# - link: https://hz.lianjia.com/zufang/ +# name: 杭州 +# - link: https://huzhou.lianjia.com/zufang/ +# name: 湖州 +# - link: https://jx.lianjia.com/zufang/ +# name: 嘉兴 +# - link: https://jh.lianjia.com/zufang/ +# name: 金华 +# - link: https://nb.lianjia.com/zufang/ +# name: 宁波 +# - link: https://sx.lianjia.com/zufang/ +# name: 绍兴 +# - link: https://taizhou.lianjia.com/zufang/ +# name: 台州 +# - link: https://wz.lianjia.com/zufang/ +# name: 温州 + + + + + + # !!!请尽量为被爬方考虑,不要抓全站,配置你需要抓取的城市即可! # 需要抓取招聘信息的城市 zlCityList: @@ -2306,6 +2565,4 @@ zlCityList: # - name: 遵化 # url: https://www.zhaopin.com/zunhua/ # code: 10143 -# pinyin: zunhua - - +# pinyin: zunhua \ No newline at end of file diff --git a/configs/config.go b/configs/config.go index c487ef8..a0f52f5 100644 --- a/configs/config.go +++ b/configs/config.go @@ -12,42 +12,33 @@ type singleton struct { } var instance *singleton +var config_path string -func init() { +func GetInstance() *singleton { if instance == nil { - instance = new(singleton) - dir, _ := filepath.Abs(filepath.Dir(os.Args[0])) - err := config.LoadFile(dir + "/config.yaml") - if err != nil { - err = config.LoadFile("./config.yaml") + if config_path == "" { + instance = new(singleton) + dir, _ := filepath.Abs(filepath.Dir(os.Args[0])) + err := config.LoadFile(dir + "/config.yaml") if err != nil { - fmt.Println("加载配置文件错误!!请确认当前目录下包含config.yaml文件") - fmt.Println(err) + err = config.LoadFile("./config.yaml") + if err != nil { + fmt.Println("加载配置文件错误!!请确认当前目录下包含config.yaml文件或者指定配置文件参数") + fmt.Println(err) + } } - } - conf := config.Map() - instance.configInfo = conf - - fmt.Println(conf) - } - -} - -func GetInstance() *singleton { - if instance == nil { - instance = new(singleton) - dir, _ := filepath.Abs(filepath.Dir(os.Args[0])) - err := config.LoadFile(dir + "/config.yaml") - if err != nil { - err = config.LoadFile("./config.yaml") + conf := config.Map() + instance.configInfo = conf + } else { + instance = new(singleton) + err := config.LoadFile(config_path) if err != nil { - fmt.Println("加载配置文件错误!!请确认当前目录下包含config.yaml文件") + fmt.Println("加载配置文件错误!!请确认当前目录下包含config.yaml文件或者指定配置文件参数") fmt.Println(err) } + conf := config.Map() + instance.configInfo = conf } - conf := config.Map() - instance.configInfo = conf - fmt.Println(conf) } return instance @@ -57,3 +48,7 @@ func Config() map[string]interface{} { return GetInstance().configInfo } + +func SetConfig(path string) { + config_path = path +} diff --git a/db/client.go b/db/client.go index b4e60ae..0673faf 100644 --- a/db/client.go +++ b/db/client.go @@ -102,10 +102,36 @@ func SetZhilianStatus(cityIndex int, kwIndex int) { lianjia_status.InsertOne(ctx, bson.M{"city_index": cityIndex, "kw_index": kwIndex}) } +func GetLianjiaZuFangStatus() int { + client := GetInstance().client + ctx := GetInstance().ctx + configInfo := configs.Config() + db := client.Database(configInfo["dbDatabase"].(string)) + lianjia_status := db.Collection("lianjiazf_status") + var res bson.M + err := lianjia_status.FindOne(ctx, bson.M{}).Decode(&res) + if err != nil { + return 0 + } + + index := res["index"].(int32) + return int(index) +} + +func SetLianjiaZuFangStatus(i int) { + client := GetInstance().client + ctx := GetInstance().ctx + configInfo := configs.Config() + db := client.Database(configInfo["dbDatabase"].(string)) + lianjia_status := db.Collection("lianjiazf_status") + lianjia_status.DeleteMany(ctx, bson.M{}) + lianjia_status.InsertOne(ctx, bson.M{"index": i}) +} + func GetCtx() context.Context { return GetInstance().ctx } func GetClient() *mongo.Client { return GetInstance().client -} \ No newline at end of file +} diff --git a/db/save.go b/db/save.go index f7158d9..f51580d 100644 --- a/db/save.go +++ b/db/save.go @@ -106,7 +106,6 @@ func AddZLItem(items []interface{}) { _, err := lianjia.InsertMany(ctx, items) if err != nil { if !strings.Contains(err.Error(), "multiple write errors") { - fmt.Print("数据库插入失败!") fmt.Println(err) } } diff --git a/clean_status.go b/entrance/clean_status.go similarity index 75% rename from clean_status.go rename to entrance/clean_status.go index 310e433..332e63d 100644 --- a/clean_status.go +++ b/entrance/clean_status.go @@ -1,4 +1,4 @@ -package main +package entrance import ( "context" @@ -13,18 +13,20 @@ import ( "time" ) -func main() { - +func Start_clean() { var choice int - if len(os.Args) > 1 && strings.Index(os.Args[1], "lianjia") > -1 { + if strings.Index(strings.Join(os.Args, ""), "lianjia_ershou") > -1 { choice = 1 - } else if len(os.Args) > 1 && strings.Index(os.Args[1], "zhilian") > -1 { + } else if strings.Index(strings.Join(os.Args, ""), "zhilian") > -1 { choice = 2 + } else if strings.Index(strings.Join(os.Args, ""), "lianjia_zufang") > -1 { + choice = 3 } else { fmt.Println("清除抓取状态(不清除状态的话爬虫会从上次停止位置继续抓取)") fmt.Println("请选择需要清除哪个爬虫的的状态数据:(输入数字)") - fmt.Println("1.链家") + fmt.Println("1.链家二手房") fmt.Println("2.智联") + fmt.Println("3.链家租房") fmt.Scanln(&choice) } @@ -36,6 +38,10 @@ func main() { } else if choice == 2 { db.SetZhilianStatus(0, 0) fmt.Println("Done!") + } else if choice == 3 { + db.SetLianjiaZuFangStatus(0) + clean_visit() + fmt.Println("Done!") } else { fmt.Println("选择错误!") } @@ -66,4 +72,4 @@ func clean_visit() { fmt.Println(err) } -} \ No newline at end of file +} diff --git a/entrance/info.go b/entrance/info.go new file mode 100644 index 0000000..dbf2d7b --- /dev/null +++ b/entrance/info.go @@ -0,0 +1,82 @@ +package entrance + +import ( + "getAwayBSG/configs" + "getAwayBSG/db" + "go.mongodb.org/mongo-driver/bson" + "os" + "strconv" + "strings" + "time" +) + +func Start_info() { + + fd, _ := os.OpenFile("./numLog.txt", os.O_RDWR|os.O_CREATE|os.O_APPEND, 0644) + fd_time := time.Now().Format("2006-01-02 15:04:05") + fd_content := strings.Join([]string{ + fd_time, ":\n", + getLianjiaErShouFangStatus(), "\n", + getLianJiaZuFangStatus(), "\n", + getZhiLianStatus(), "\n", + }, "") + buf := []byte(fd_content) + fd.Write(buf) + fd.Close() + +} + +func getLianjiaErShouFangStatus() string { + configInfo := configs.Config() + client := db.GetClient() + ctx := db.GetCtx() + + odb := client.Database(configInfo["dbDatabase"].(string)) + lianjia := odb.Collection(configInfo["dbCollection"].(string)) + lianjia_status := odb.Collection("lianjia_status") + var info bson.M + + res := lianjia_status.FindOne(ctx, bson.M{}) + res.Decode(&info) + detailNum, _ := lianjia.CountDocuments(ctx, bson.M{"address": bson.M{"$exists": true}}) + allNum, _ := lianjia.CountDocuments(ctx, bson.M{}) + + return "链家二手房:详情数" + strconv.Itoa(int(detailNum)) + "总数:" + strconv.Itoa(int(allNum)) + " index:" + strconv.Itoa(int(info["index"].(int32))); +} + +func getZhiLianStatus() string { + configInfo := configs.Config() + client := db.GetClient() + ctx := db.GetCtx() + + odb := client.Database(configInfo["dbDatabase"].(string)) + zhilian := odb.Collection(configInfo["zlDBCollection"].(string)) + zhilian_status := odb.Collection("zhilian_status") + + zhilianNum, _ := zhilian.CountDocuments(ctx, bson.M{}) + var info bson.M + res := zhilian_status.FindOne(ctx, bson.M{}) + res.Decode(&info) + + return "智联总数:" + strconv.Itoa(int(zhilianNum)) + " city_index:" + strconv.Itoa(int(info["city_index"].(int32))) + " kw_index:" + strconv.Itoa(int(info["kw_index"].(int32))) +} + +func getLianJiaZuFangStatus() string { + configInfo := configs.Config() + client := db.GetClient() + ctx := db.GetCtx() + + odb := client.Database(configInfo["dbDatabase"].(string)) + lianjiaZf := odb.Collection(configInfo["zufangCollection"].(string)) + lianjiaZFStatus := odb.Collection("lianjiazf_status") + + var info bson.M + + res := lianjiaZFStatus.FindOne(ctx, bson.M{}) + + res.Decode(&info) + + allNum, _ := lianjiaZf.CountDocuments(ctx, bson.M{}) + + return "链家租房:总数" + strconv.Itoa(int(allNum)) + " index:" + strconv.Itoa(int(info["index"].(int32))) +} diff --git a/lianjia.go b/entrance/lianjia.go similarity index 97% rename from lianjia.go rename to entrance/lianjia.go index 80e7b11..4916ab5 100644 --- a/lianjia.go +++ b/entrance/lianjia.go @@ -1,4 +1,4 @@ -package main +package entrance import ( "encoding/json" @@ -115,7 +115,8 @@ func crawlerOneCity(cityUrl string) { } re, _ := regexp.Compile("pg\\d+/*") goUrl = re.ReplaceAllString(goUrl, "") - c.Visit(goUrl) + err = c.Visit(goUrl) + fmt.Println(err) }) // 下一页 @@ -128,14 +129,16 @@ func crawlerOneCity(cityUrl string) { re, _ := regexp.Compile("pg\\d+/*") gourl = re.ReplaceAllString(element.Request.URL.String(), "") gourl = gourl + "pg" + strconv.Itoa(page.CurPage+1) - c.Visit(gourl) + err = c.Visit(gourl) + fmt.Println(err) } } }) }) - c.Visit(cityUrl) + err := c.Visit(cityUrl) + fmt.Println(err) } @@ -259,7 +262,7 @@ func crawlDetail() (sucnum int) { return sucnum } -func main() { +func Start_lianjia_ershou() { listFlag := make(chan int) detailFlag := make(chan int) @@ -284,4 +287,4 @@ func main() { <-listFlag <-detailFlag -} \ No newline at end of file +} diff --git a/entrance/lianjia_zufang.go b/entrance/lianjia_zufang.go new file mode 100644 index 0000000..1a74321 --- /dev/null +++ b/entrance/lianjia_zufang.go @@ -0,0 +1,185 @@ +package entrance + +import ( + "encoding/json" + "fmt" + "getAwayBSG/configs" + "getAwayBSG/db" + "github.com/gocolly/colly" + "github.com/gocolly/colly/extensions" + "github.com/gocolly/colly/proxy" + cachemongo "github.com/zolamk/colly-mongo-storage/colly/mongo" + "go.mongodb.org/mongo-driver/bson" + "net/url" + "regexp" + "strconv" + "strings" + "time" +) + +func TcrawlerOneCityZuFang(cityUrl string, cityname string) { + c := colly.NewCollector() + configInfo := configs.Config() + + if configInfo["crawlDelay"] != nil { + delay, _ := configInfo["crawlDelay"].(json.Number).Int64() + if delay > 0 { + c.Limit(&colly.LimitRule{ + DomainGlob: "*", + Delay: time.Duration(delay) * time.Second, + }) + } + } + + if configInfo["proxyList"] != nil && len(configInfo["proxyList"].([]interface{})) > 0 { + var proxyList []string + for _, v := range configInfo["proxyList"].([]interface{}) { + proxyList = append(proxyList, v.(string)) + } + + if configInfo["proxyList"] != nil { + rp, err := proxy.RoundRobinProxySwitcher(proxyList...) + if err != nil { + fmt.Println(err) + } + c.SetProxyFunc(rp) + } + } + extensions.RandomUserAgent(c) + extensions.Referer(c) + storage := &cachemongo.Storage{ + Database: "colly", + URI: configInfo["dburl"].(string) + "/colly", + } + if err := c.SetStorage(storage); err != nil { + panic(err) + } + c.OnRequest(func(r *colly.Request) { + fmt.Println("列表抓取:", r.URL.String()) + }) + + c.OnHTML("title", func(element *colly.HTMLElement) { + fmt.Println(element.Text) + }) + + c.OnHTML(".content__list--item", func(element *colly.HTMLElement) { + + var link string + var title string + var address string + var area string + var price int + element.ForEach(".twoline a", func(i int, element *colly.HTMLElement) { + link = "https://" + element.Request.URL.Host + element.Attr("href") + title = strings.TrimSpace(element.Text) + }) + + element.ForEach(".content__list--item--des a", func(i int, element *colly.HTMLElement) { + if i == 0 { + address = element.Text + } else { + area = element.Text + } + }) + + element.ForEach(".content__list--item-price em", func(i int, element *colly.HTMLElement) { + var err error + price, err = strconv.Atoi(element.Text) + if err != nil { + price = 0 + } + }) + + fmt.Println(price) + fmt.Println(link) + fmt.Println(title) + fmt.Println(address) + fmt.Println(area) + fmt.Println(cityname) + fmt.Println("--------------------") + + client := db.GetClient() + ctx := db.GetCtx() + + db := client.Database(configInfo["dbDatabase"].(string)) + lianjia := db.Collection(configInfo["zufangCollection"].(string)) + _, err := lianjia.InsertOne(ctx, bson.M{ + "Link": link, + "title": title, + "address": address, + "area": area, + "price": price, + "city": cityname, + "crawl_time": time.Now(), + }) + if err != nil { + if !strings.Contains(err.Error(), "multiple write errors") { + fmt.Print("数据库插入失败!") + fmt.Println(err) + } + } + + }) + + c.OnHTML(".content__pg", func(element *colly.HTMLElement) { + totalPage := element.Attr("data-totalpage") + iTotalPage, err := strconv.Atoi(totalPage) + + if err == nil { + for i := 2; i < iTotalPage; i++ { + + re, _ := regexp.Compile("pg\\d+/*") + goUrl := re.ReplaceAllString(element.Request.URL.String(), "") + + err = c.Visit(goUrl + "pg" + strconv.Itoa(i) + "/") + if err.Error() != "URL already visited" { + fmt.Println(err) + } + + } + } + }) + + c.OnHTML(".filter a", func(element *colly.HTMLElement) { + //// 切换地点 + u, err := url.Parse(cityUrl) + if err != nil { + panic(err) + } + rootUrl := u.Scheme + "://" + u.Host + goUrl := element.Attr("href") + u, err = url.Parse(goUrl) + if err != nil && err.Error() != "URL already visited" { + fmt.Println(err) + } + if u.Scheme == "" { + goUrl = rootUrl + u.Path + } else { + goUrl = u.String() + } + re, _ := regexp.Compile("pg\\d+/*") + goUrl = re.ReplaceAllString(goUrl, "") + err = c.Visit(goUrl) + if err != nil && err.Error() != "URL already visited" { + fmt.Println(err) + } + + }) + + err := c.Visit(cityUrl) + if err != nil && err.Error() != "URL already visited" { + fmt.Println(err) + } + +} + +func Start_LianjiaZufang() { + configinfo := configs.Config() + + cityList := configinfo["zufangCityList"].([]interface{}) + + for i := db.GetLianjiaZuFangStatus(); i < len(cityList); i++ { + TcrawlerOneCityZuFang(cityList[i].(map[string]interface{})["link"].(string), cityList[i].(map[string]interface{})["name"].(string)) + db.SetLianjiaZuFangStatus(i) + } +} diff --git a/zhilian.go b/entrance/zhilian.go similarity index 98% rename from zhilian.go rename to entrance/zhilian.go index 40bd8e8..a232aa6 100644 --- a/zhilian.go +++ b/entrance/zhilian.go @@ -1,4 +1,4 @@ -package main +package entrance import ( "crypto/tls" @@ -14,7 +14,7 @@ import ( "time" ) -func main() { +func Start_zhilian() { configInfo := configs.Config() keys := configInfo["zlKeyWords"].([]interface{}) cityList := configInfo["zlCityList"].([]interface{}) @@ -115,4 +115,4 @@ func get(link string) (bodystr string) { } return bodystr -} \ No newline at end of file +} diff --git a/main.go b/main.go new file mode 100644 index 0000000..2a5d840 --- /dev/null +++ b/main.go @@ -0,0 +1,54 @@ +package main + +import ( + "flag" + "fmt" + "getAwayBSG/configs" + "getAwayBSG/entrance" +) + +// 实际中应该用更好的变量名 +var ( + help bool + config string + lianjia_ershou bool + lianjia_zufang bool + zhilian bool + clean bool + info bool +) + +func init() { + flag.BoolVar(&help, "help", false, "显示帮助") + flag.StringVar(&config, "config", "./config.yaml", "设置配置文件") + flag.BoolVar(&lianjia_ershou, "lianjia_ershou", false, "抓取链家二手房数据") + flag.BoolVar(&lianjia_zufang, "lianjia_zufang", false, "抓取链家租房数据") + flag.BoolVar(&zhilian, "zhilian", false, "抓取智联招聘数据") + flag.BoolVar(&clean, "clean", false, "清理缓存") + flag.BoolVar(&info, "info", false, "保存抓取状态") +} + +func main() { + flag.Parse() + if config != "" { + configs.SetConfig(config) + } + fmt.Println(configs.Config()) + + if help { + flag.Usage() + } else if lianjia_ershou { + entrance.Start_lianjia_ershou() + } else if lianjia_zufang { + entrance.Start_LianjiaZufang() + } else if zhilian { + entrance.Start_zhilian() + } else if clean { + entrance.Start_clean() + } else if info { + entrance.Start_info() + } else { + flag.Usage() + } + +} diff --git a/numLog.txt b/numLog.txt new file mode 100644 index 0000000..262154b --- /dev/null +++ b/numLog.txt @@ -0,0 +1,11 @@ +======2019-07-12 09:20:26=====链接:详情数166529总数:378005;智联总数:120759 +======2019-07-12 09:20:46=====链接:详情数166538总数:378052;智联总数:120759 +======2019-07-29 10:12:01=====链接:详情数552711总数:1328148;智联总数:134898 index:14 city_index:367 kw_index:1 +2019-07-30 14:10:15: +链家二手房:详情数554007总数:1332650 index:0 +链家租房:总数9675index:0 +智联总数:137323 city_index:0 kw_index:0 +2019-07-30 14:11:08: +链家二手房:详情数554007总数:1332650 index:0 +链家租房:总数10003 index:0 +智联总数:137323 city_index:0 kw_index:0 diff --git a/proxypool/proxy.go b/proxypool/proxy.go index 765efcb..b40b30c 100644 --- a/proxypool/proxy.go +++ b/proxypool/proxy.go @@ -60,4 +60,4 @@ func getOneProxy() (string, string) { proxy := "http://" + string(body) fmt.Println("使用默认代理:" + proxy) return proxy, string(body) -} \ No newline at end of file +}