Create the project
scrapy startproject ithome
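For reference, startproject generates the standard Scrapy layout; the files edited below all live inside it:

ithome/
    scrapy.cfg            # deploy configuration
    ithome/
        items.py          # item definitions
        pipelines.py      # item pipelines
        settings.py       # project settings
        spiders/          # spider modules go here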
Create the CrawlSpider
scrapy genspider -t crawl it ithome.com
items.py
import scrapy


class IthomeItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()
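If the dict-like behavior of Scrapy items is unfamiliar, here is a quick sanity check; the sample values are made up:

from ithome.items import IthomeItem

item = IthomeItem()
item['title'] = 'sample headline'
item['content'] = ['first paragraph', 'second paragraph']
print(dict(item))   # {'title': 'sample headline', 'content': [...]}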
it.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ithome.items import IthomeItem


class ItSpider(CrawlSpider):
    name = 'it'
    allowed_domains = ['ithome.com']
    start_urls = ['https://it.ithome.com/ityejie/']

    rules = (
        # follow channel/pagination pages without parsing them
        Rule(LinkExtractor(allow=r'ityejie/'), follow=True),
        # article pages; note the escaped dot in the pattern
        Rule(LinkExtractor(allow=r'html/it/\d+\.htm',
                           restrict_xpaths='//*[@id="wrapper"]//*[@class="block"]'),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = IthomeItem()
        # extract_first() returns None instead of raising IndexError on a miss
        item['title'] = response.xpath('//*[@id="wrapper"]/div[1]/div[2]/h1/text()').extract_first()
        item['content'] = response.xpath('//*[@id="paragraph"]/p/text()').extract()
        yield item
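To verify the two Rule patterns against the live page before running a full crawl, scrapy shell can exercise the same LinkExtractor interactively. The URL and XPath below are the ones from the spider; the links actually returned depend on the current state of the page:

scrapy shell 'https://it.ithome.com/ityejie/'
>>> from scrapy.linkextractors import LinkExtractor
>>> le = LinkExtractor(allow=r'html/it/\d+\.htm', restrict_xpaths='//*[@id="wrapper"]//*[@class="block"]')
>>> le.extract_links(response)[:5]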
pipelines.py
import json


class IthomePipeline(object):

    def __init__(self):
        # explicit encoding so non-ASCII text survives on every platform
        self.filename = open("it.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        text = json.dumps(dict(item), ensure_ascii=False) + ",\n"
        self.filename.write(text)
        return item

    def close_spider(self, spider):
        self.filename.close()
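Note that the file written above is a sequence of comma-separated JSON objects, not one valid JSON document. If a machine-readable file matters, a common alternative is JSON Lines (one object per line). A minimal sketch follows; the class name IthomeJsonLinesPipeline is chosen here for illustration and would replace IthomePipeline in ITEM_PIPELINES:

import json


class IthomeJsonLinesPipeline(object):

    def open_spider(self, spider):
        self.file = open('it.jl', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # one JSON object per line, easy to stream-parse later
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.file.close()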
settings.py
BOT_NAME = 'ithome'

SPIDER_MODULES = ['ithome.spiders']
NEWSPIDER_MODULE = 'ithome.spiders'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

ITEM_PIPELINES = {
    'ithome.pipelines.IthomePipeline': 300,
}
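Optionally, a few throttling settings keep the crawl polite; the values below are illustrative assumptions, not tuned for ithome.com:

DOWNLOAD_DELAY = 1
CONCURRENT_REQUESTS_PER_DOMAIN = 8
AUTOTHROTTLE_ENABLED = True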
Run the spider
scrapy crawl it
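As an alternative to the custom pipeline, Scrapy's built-in feed export can write a well-formed JSON array directly; the output filename here is arbitrary:

scrapy crawl it -o articles.json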