需求:爬取教程中的题目以及作者
url : http://www.wxapp-union.com/portal.php?mod=list&catid=2&page=1
Spider 代码如下:
| import scrapy from scrapy.linkextractors import LinkExtractor from scrapy.spiders import CrawlSpider, Rule
class CxcSpider(CrawlSpider):
    """Crawl the wxapp-union.com tutorial listing and scrape article title/author.

    Starts from page 1 of the ``catid=2`` listing, follows the pagination
    links, and hands every ``article-<id>-1.html`` page to ``parse_item``.
    """

    name = 'cxc'
    allowed_domains = ['wxapp-union.com']
    start_urls = ['http://www.wxapp-union.com/portal.php?mod=list&catid=2&page=1']

    # NOTE: literal dots are escaped in both patterns. Unescaped, '.' matches
    # any character, so the rules would accept URLs they were not meant to.
    rules = (
        # Listing pages: no callback, only keep following pagination links.
        Rule(
            LinkExtractor(
                allow=r'http://www\.wxapp-union\.com/portal\.php\?mod=list&catid=2&page=\d+'
            ),
            follow=True,
        ),
        # Article pages: parsed by parse_item. With a callback set and no
        # explicit ``follow``, Scrapy does not follow links from these pages.
        Rule(
            LinkExtractor(allow=r'http://www\.wxapp-union\.com/article-\d+-1\.html'),
            callback='parse_item',
        ),
    )

    def parse_item(self, response):
        """Extract the title and author from one article page.

        Args:
            response: The Scrapy response for an article URL matched by ``rules``.

        Returns:
            dict: ``{'title': ..., 'author': ...}``. ``extract_first()`` yields
            ``None`` for a field whose XPath matches nothing.
        """
        # The XPaths are absolute positional paths under the element with
        # id="ct"; presumably the <h1> is the article heading and the <a> the
        # author link -- verify against the live page, since any layout
        # change silently turns these into None.
        title = response.xpath(
            '//*[@id="ct"]/div[1]/div/div[1]/div/div[2]/div[1]/h1/text()'
        ).extract_first()
        author = response.xpath(
            '//*[@id="ct"]/div[1]/div/div[1]/div/div[2]/div[3]/div[1]/p/a/text()'
        ).extract_first()

        item = {'title': title, 'author': author}
        print(item)
        return item
|