im trying to scrap some info from “El Peruano” journal, but i cannot at first sight it look have to:
- Put a Date in a Formbox.
- Do a click in SearchBox.
- Get all links for get all: “Title”,”Resolution#”, “Body”
This is my code:
import scrapy
class SpiderPeruano(scrapy.Spider):
name = "peruano"
start_urls = [
"https://diariooficial.elperuano.pe/Normas"
]
custom_settings= {
"FEED_URI": "peruano.json",
"FEED_FORMAT": "json",
"FEED_EXPORT_ENCODING": "utf-8"
}
def parse_click(self, response):
#i put here a condition but i think is not necessary
#button = response.xpath("//div[@id='busqueda']/form[@action]/button[@id='btnBuscar']").get()
#if buttom:
yield scrapy.FormRequest.from_response(
response,
formxpath= "//form[@id='space_PortalNormasLegalesN']",
formdata={"cddesde": "08/03/2022", "cdhasta:": "08/03/2022"},
dont_click=True,
dont_filter=True,
callback=self.parse
)
def parse(self, response):
links = response.xpath("//div[@class='ediciones_texto']/h5/a/@href").getall()
for link in links:
yield response.follow(link, callback=self.parse_link)
def parse_link(self, response):
title = response.xpath("//div[@class='story']/h1[@class='sumilla']/text()").get()
num = response.xpath("//div[@class='story']/h2[@class='resoluci-n']/text()").getall()
body = response.xpath("//div[@class='story']/p/text()").getall()
yield {
"title": title,
"num": num,
"body": body
}
#call
#scrapy crawl peruano
#url = "https://diariooficial.elperuano.pe/normas"
#Form_BOX: "//form[@action]"
#Box_desde = "//form[@action]/input[@id='cddesde']"
#Box_hasta = "//form[@action]/input[@id='cdhasta']"
#Button= "//div[@id='busqueda']/form[@action]/button[@id='btnBuscar']"
#links = "//div[@class='ediciones_texto']/h5/a/@href"
#titles= "//div[@class='story']/h1[@class='sumilla']/text()"
#resolutionNum= "//div[@class='story']/h2[@class='resoluci-n']/text()"
#body= "//div[@class='story']/p/text()"
So, i need some help for know what i’m doing wrong on my code cuz this run well but dont get the data.
Thx a lot for your time and help!