Abstract: Starting from the "How do you evaluate X" topic, the spider scrapes its questions, then follows each question's related questions in a loop, collecting the title, follower count, answer count, and other data for every question.
Start from the "How do you evaluate X" topic and scrape its questions, then crawl each question's related questions and keep looping.
For every question, scrape the title, number of followers, number of answers, and other data.
zhihuTopicSpider.py
# -*- coding: utf-8 -*-
import scrapy
import os
import time
import re
import json

from ..items import zhihuQuestionItem

# mode 1: use the Tencent internal proxy, 2: no proxy
mode = 2
proxy = "https://web-proxy.oa.com:8080" if mode == 1 else ""

# Account credentials
email = "youremail"
password = "yourpassword"


class zhihu_topicSpider(scrapy.Spider):
    name = "zhihu_topicSpider"
    zhihu_url = "https://www.zhihu.com"
    login_url = "https://www.zhihu.com/login/email"
    topic = "https://www.zhihu.com/topic"
    domain = "https://www.zhihu.com"

    # Request headers
    headers_dict = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Connection": "keep-alive",
        "Host": "www.zhihu.com",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/48.0.2564.109 Safari/537.36"
    }

    def start_requests(self):
        yield scrapy.Request(
            url=self.zhihu_url,
            headers=self.headers_dict,
            meta={"proxy": proxy, "cookiejar": 1},
            callback=self.request_captcha
        )

    def request_captcha(self, response):
        # Extract the _xsrf token from the login page
        _xsrf = response.css('input[name="_xsrf"]::attr(value)').extract()[0]
        # Build the captcha URL and request the image
        captcha_url = "http://www.zhihu.com/captcha.gif?r=" + str(time.time() * 1000)
        yield scrapy.Request(
            url=captcha_url,
            headers=self.headers_dict,
            meta={
                "proxy": proxy,
                "cookiejar": response.meta["cookiejar"],
                "_xsrf": _xsrf
            },
            callback=self.download_captcha
        )

    def download_captcha(self, response):
        # Save the captcha image to disk
        with open("captcha.gif", "wb") as fp:
            fp.write(response.body)
        # Open the captcha image (macOS "open" command)
        os.system("open captcha.gif")
        # Ask the user to type in the captcha
        print "Please enter the captcha: "
        captcha = raw_input()
        # Submit the account, password and captcha
        yield scrapy.FormRequest(
            url=self.login_url,
            headers=self.headers_dict,
            formdata={
                "email": email,
                "password": password,
                "_xsrf": response.meta["_xsrf"],
                "remember_me": "true",
                "captcha": captcha
            },
            meta={
                "proxy": proxy,
                "cookiejar": response.meta["cookiejar"],
            },
            callback=self.request_zhihu
        )

    def request_zhihu(self, response):
        """Logged in now; request the topic page on www.zhihu.com."""
        yield scrapy.Request(
            url=self.topic + "/19760570",
            headers=self.headers_dict,
            meta={
                "proxy": proxy,
                "cookiejar": response.meta["cookiejar"],
            },
            callback=self.get_topic_question,
            dont_filter=True
        )

    def get_topic_question(self, response):
        # with open("topic.html", "wb") as fp:
        #     fp.write(response.body)
        # Collect the question URLs listed under the topic
        question_urls = response.css(".question_link[target=_blank]::attr(href)").extract()
        length = len(question_urls)
        k = -1
        temp = []
        # Keep every third link (indices 2, 5, 8, ...)
        for j in range(length / 3):
            temp.append(question_urls[k + 3])
            k += 3
        for url in temp:
            yield scrapy.Request(
                url=self.zhihu_url + url,
                headers=self.headers_dict,
                meta={
                    "proxy": proxy,
                    "cookiejar": response.meta["cookiejar"],
                },
                callback=self.parse_question_data
            )

    def parse_question_data(self, response):
        item = zhihuQuestionItem()
        item["qid"] = re.search(r"\d+", response.url).group()
        item["title"] = response.css(".zm-item-title::text").extract()[0].strip()
        item["answers_num"] = response.css("h3::attr(data-num)").extract()[0]
        question_nums = response.css(".zm-side-section-inner .zg-gray-normal strong::text").extract()
        item["followers_num"] = question_nums[0]
        item["visitsCount"] = question_nums[1]
        item["topic_views"] = question_nums[2]
        topic_tags = response.css(".zm-item-tag::text").extract()
        if len(topic_tags) >= 3:
            item["topic_tag0"] = topic_tags[0].strip()
            item["topic_tag1"] = topic_tags[1].strip()
            item["topic_tag2"] = topic_tags[2].strip()
        elif len(topic_tags) == 2:
            item["topic_tag0"] = topic_tags[0].strip()
            item["topic_tag1"] = topic_tags[1].strip()
            item["topic_tag2"] = "-"
        elif len(topic_tags) == 1:
            item["topic_tag0"] = topic_tags[0].strip()
            item["topic_tag1"] = "-"
            item["topic_tag2"] = "-"
        else:
            # No tags at all: fill placeholders so the MySQL pipeline never hits a KeyError
            item["topic_tag0"] = "-"
            item["topic_tag1"] = "-"
            item["topic_tag2"] = "-"
        # print type(item["title"])
        question_links = response.css(".question_link::attr(href)").extract()
        yield item
        # Follow the related questions and parse them the same way
        for url in question_links:
            yield scrapy.Request(
                url=self.zhihu_url + url,
                headers=self.headers_dict,
                meta={
                    "proxy": proxy,
                    "cookiejar": response.meta["cookiejar"],
                },
                callback=self.parse_question_data
            )
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json
import MySQLdb


# class JsonDumpPipeline(object):
#     def process_item(self, item, spider):
#         with open("d.json", "a") as fp:
#             fp.write(json.dumps(dict(item), ensure_ascii=False).encode("utf-8") + "\n")


class MySQLPipeline(object):
    # Note: plain string interpolation assumes the values contain no quotes;
    # parameterized queries would be safer.
    sql_questions = (
        "INSERT INTO questions("
        "qid, title, answers_num, followers_num, visitsCount, topic_views, "
        "topic_tag0, topic_tag1, topic_tag2) "
        "VALUES ('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')")
    count = 0

    def open_spider(self, spider):
        host = "localhost"
        user = "root"
        password = "wangqi"
        dbname = "zh"
        self.conn = MySQLdb.connect(host, user, password, dbname)
        self.cursor = self.conn.cursor()
        self.conn.set_character_set("utf8")
        self.cursor.execute("SET NAMES utf8;")
        self.cursor.execute("SET CHARACTER SET utf8;")
        self.cursor.execute("SET character_set_connection=utf8;")
        print "MYSQL DB CURSOR INIT SUCCESS!!"
        # Create the questions table if it does not exist yet
        sql = (
            "CREATE TABLE IF NOT EXISTS questions ("
            "qid VARCHAR(100) NOT NULL,"
            "title VARCHAR(100),"
            "answers_num INT(11),"
            "followers_num INT(11) NOT NULL,"
            "visitsCount INT(11),"
            "topic_views INT(11),"
            "topic_tag0 VARCHAR(600),"
            "topic_tag1 VARCHAR(600),"
            "topic_tag2 VARCHAR(600),"
            "PRIMARY KEY (qid)"
            ")")
        self.cursor.execute(sql)
        print "TABLES ARE READY!"

    def process_item(self, item, spider):
        sql = self.sql_questions % (
            item["qid"], item["title"], item["answers_num"], item["followers_num"],
            item["visitsCount"], item["topic_views"],
            item["topic_tag0"], item["topic_tag1"], item["topic_tag2"])
        self.cursor.execute(sql)
        # Commit every ten items
        if self.count % 10 == 0:
            self.conn.commit()
        self.count += 1
        print item["qid"] + " DATA COLLECTED!"
        return item
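For the pipeline to run it has to be registered in the project's settings.py. Below is a minimal sketch; "zhihu" is a placeholder for whatever the Scrapy project package is actually called, and the delay value is only an assumption, not part of the original project:

# settings.py (excerpt) -- "zhihu" is a placeholder project package name
ITEM_PIPELINES = {
    "zhihu.pipelines.MySQLPipeline": 300,
}
COOKIES_ENABLED = True   # the login flow depends on the cookiejar
DOWNLOAD_DELAY = 1       # assumed value, purely to stay polite to the site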
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy
from scrapy import Field


class zhihuQuestionItem(scrapy.Item):
    qid = Field()
    title = Field()
    followers_num = Field()
    answers_num = Field()
    visitsCount = Field()
    topic_views = Field()
    topic_tag0 = Field()
    topic_tag1 = Field()
    topic_tag2 = Field()
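With the three files in place, the spider can be started from the project root with: scrapy crawl zhihu_topicSpider. A small runner script works too; this is only a sketch, and "zhihu" again stands in for the real project package name:

# run.py -- minimal runner sketch; "zhihu" is a placeholder package name
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from zhihu.spiders.zhihuTopicSpider import zhihu_topicSpider

process = CrawlerProcess(get_project_settings())
process.crawl(zhihu_topicSpider)
process.start()  # blocks until the crawl is finished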