Web scraping a password-protected site with Scrapy
distrispider.py
import scrapy
from scrapy.selector import Selector
from distrifil.items import DistrifilItem


class DistrispiderSpider(scrapy.Spider):
    name = 'distrispider'
    allowed_domains = ['distrifil.com']
    start_urls = ['https://distrifil.com/fr/login']

    def parse(self, response):
        # Fill in the login form found on the start page and submit it.
        data = {
            'login': 'login_a_modifier',
            'code_client': 'code_a_modifier',
            'pass': 'password_a_modifier'
        }
        yield scrapy.FormRequest.from_response(
            response,
            formdata=data,
            callback=self.after_login,
            method="POST")

    def after_login(self, response):
        # Once authenticated, move on to the catalogue page.
        yield scrapy.Request(url="https://distrifil.com/fr/catalogue", callback=self.parse_cats)

    def parse_cats(self, response):
        # Each category tile carries its target URL inside an onclick attribute.
        onclicks = response.xpath('//div[@class="col-xs-6 col-sm-6 col-md-5columns vignette-categorie"]/@onclick').extract()
        for onclick in onclicks:
            url = onclick.replace('document.location.href=', '').replace("'", '')
            yield scrapy.Request(url=url, callback=self.parse_sous_cats)

    def parse_sous_cats(self, response):
        # Follow every sub-category link to reach the product pages.
        url_cats = response.xpath('//a[@class="lien-categorie"]/@href').extract()
        for url_cat in url_cats:
            yield scrapy.Request(url=url_cat, callback=self.parse_product)

    def parse_product(self, response):
        sel = Selector(response)
        item = DistrifilItem()
        item['product_name'] = sel.xpath('//div[@class="produit_designation"]/text()').extract_first()
        item['product_reference'] = sel.xpath('//div[@class="panel-body"]/div/strong/text()').extract_first()
        item['product_price'] = sel.xpath('//div[@class="col-xs-5 prix btn btn-default masque_tarif"]/text()').extract_first()
        item['product_status'] = sel.xpath('//div[@class="not-sold-available"]/span[@class="not-sold-available-text"]/text()').extract_first()
        item['product_url'] = sel.xpath('//img/@data-remote').extract_first()
        # lightbox
        # product_urls = sel.xpath('//img/@data-remote').extract()
        # for product_url in product_urls:
        #     yield scrapy.Request(url=product_url, callback=self.parse_product_infos)
        return item
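The spider imports DistrifilItem from distrifil.items, a file that is not shown in the post. Here is a minimal sketch of what that items.py could look like, assuming it simply declares the five fields populated in parse_product:

# items.py -- a minimal sketch (not shown in the original post),
# assuming the item only declares the fields filled in parse_product.
import scrapy


class DistrifilItem(scrapy.Item):
    product_name = scrapy.Field()
    product_reference = scrapy.Field()
    product_price = scrapy.Field()
    product_status = scrapy.Field()
    product_url = scrapy.Field()

From the Scrapy project root, the spider can then be run and its items exported, for example to a JSON file:

scrapy crawl distrispider -o products.json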