需求:爬取教程中的题目以及作者
url : http://www.wxapp-union.com/portal.php?mod=list&catid=2&page=1
| 12
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 
 | import scrapy
 from scrapy.linkextractors import LinkExtractor
 from scrapy.spiders import CrawlSpider, Rule
 
 
 class CxcSpider(CrawlSpider):
     """Crawl the wxapp-union tutorial listing and collect article titles and authors.

     The crawl starts at page 1 of the ``catid=2`` listing. Links to further
     listing pages are followed, and links to article pages are passed to
     :meth:`parse_item`, which returns one ``{'title', 'author'}`` dict per
     article.
     """

     name = 'cxc'
     allowed_domains = ['wxapp-union.com']
     start_urls = ['http://www.wxapp-union.com/portal.php?mod=list&catid=2&page=1']

     # In the patterns below every literal "." (and the "?") is escaped, so it
     # matches only that character rather than acting as a regex wildcard.
     rules = (
         # Listing pages (page=1, page=2, ...): follow them to discover more
         # links; no callback is attached to these pages.
         Rule(
             LinkExtractor(allow=r'http://www\.wxapp-union\.com/portal\.php\?mod=list&catid=2&page=\d+'),
             follow=True,
         ),
         # Article pages (article-<id>-1.html): hand each response to parse_item.
         Rule(
             LinkExtractor(allow=r'http://www\.wxapp-union\.com/article-\d+-1\.html'),
             callback='parse_item',
         ),
     )

     def parse_item(self, response):
         """Extract the title and author from one article page.

         Args:
             response: The downloaded article page; only its ``xpath`` method
                 is used here.

         Returns:
             dict: ``{'title': ..., 'author': ...}``, each holding the first
             text node matched by the corresponding XPath.
         """
         # NOTE(review): both XPaths are absolute positional paths under the
         # "ct" element, so they depend on the page's exact layout -- confirm
         # they still match the live site.
         title = response.xpath(
             '//*[@id="ct"]/div[1]/div/div[1]/div/div[2]/div[1]/h1/text()'
         ).extract_first()
         author = response.xpath(
             '//*[@id="ct"]/div[1]/div/div[1]/div/div[2]/div[3]/div[1]/p/a/text()'
         ).extract_first()

         item = {'title': title, 'author': author}
         print(item)
         return item
 
 |