import json

import scrapy

from myscrapy.items import MyscrapyItem
class HrSpider(scrapy.Spider):
    """Crawl Tencent careers job postings through the site's JSON API.

    Flow: ``parse`` fans out listing-page requests -> ``parse_one`` extracts
    each job's summary fields and requests its detail page -> ``parse_two``
    fills in the detail fields and yields the finished item.
    """

    name = 'hr'
    allowed_domains = ['tencent.com']
    # Listing endpoint; the {} placeholder is the 1-based page index
    # (pageSize=10 posts per page).
    one_url = 'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1606137466198&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=&pageIndex={}&pageSize=10&language=zh-cn&area=cn'
    # Detail endpoint; the {} placeholder is the post id.
    # BUG FIX: removed a stray space after 'ByPostId?' that made the
    # formatted URL invalid.
    two_url = 'https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=1605963490669&postId={}&language=zh-cn'
    start_urls = [one_url.format(1)]

    def parse(self, response):
        """Request listing pages 1..10.

        The first page is fetched twice (once via ``start_urls``); kept to
        preserve the original behavior.
        """
        for page in range(1, 11):
            yield scrapy.Request(
                url=self.one_url.format(page),
                callback=self.parse_one,
            )

    def parse_one(self, response):
        """Parse one listing page and request each job's detail page."""
        data = json.loads(response.text)
        for job in data['Data']['Posts']:
            # BUG FIX: build a fresh item per job. The original created one
            # MyscrapyItem before the loop and shared it across every job
            # and every pending detail request, so concurrent parse_two
            # callbacks overwrote each other's fields.
            item = MyscrapyItem()
            # NOTE(review): 'arae'/'tpye' look like typos for 'area'/'type',
            # but they must match the field names declared in
            # myscrapy.items.MyscrapyItem — kept as-is; TODO confirm and fix
            # in items.py together with this spider.
            item['arae'] = job['LocationName']
            item['tpye'] = job['RecruitPostName']
            detail_url = self.two_url.format(job['PostId'])
            yield scrapy.Request(
                url=detail_url,
                callback=self.parse_two,
                # Carry the partially-filled item to the detail callback.
                meta={'item': item},
            )

    def parse_two(self, response):
        """Complete the item with detail fields and emit it."""
        item = response.meta.get('item')
        data = json.loads(response.text)
        item['Responsibility'] = data['Data']['Responsibility']
        item['Requirement'] = data['Data']['Requirement']
        # Removed leftover debug print(item); Scrapy already logs yielded
        # items at DEBUG level.
        yield item