I am trying to scrape all data from big table: https://registru.dispozitive.amdm.gov.md/ website, but I am having some issues.
- URL of website does not include the nr of page you are on.
- If u resfresh it moves to 1st page.
- You cannot go to certain page automatically.
- You should ignore the small table with aditional data from left.
- The navigation html on big table is the same as navigation html on small table
I tryied couple methods of scraping:
- Extensions from google chrome (3,4 idk) but they charge you money
- Python scraping script, but at some point for i dont know why it’s stoping and remains stuck.
So i would love to get help on scraping the data from all pages, and save it into Excell file.
Here is the Script I was working on:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
def get_total_pages(driver):
soup = BeautifulSoup(driver.page_source, "html.parser")
total_pages_element = soup.find("b", class_="dxp-lead dxp-summary")
if total_pages_element:
text = total_pages_element.get_text(strip=True)
match = re.search(r'Страница (d+) из (d+)', text)
if match:
total_pages = int(match.group(2))
return total_pages
return None
def is_big_table_navigation(soup):
page_indicator = soup.find("b", class_="dxp-lead dxp-summary")
if page_indicator:
text = page_indicator.get_text(strip=True)
match = re.search(r'Страница (d+) из (d+)', text)
if match:
return True
return False
def navigate_to_starting_page(driver, start_page, total_pages):
current_page = 1
from_start = start_page - 1
from_end = total_pages - start_page
if from_start <= from_end:
print(f"Navigating to page {start_page} from the start...")
while current_page < start_page:
try:
next_button = WebDriverWait(driver, 20).until(
EC.element_to_be_clickable((By.XPATH, "//a[contains(@onclick,'PBN')]"))
)
next_button.click()
current_page += 1
print(f"Moved to page {current_page}")
WebDriverWait(driver, 20).until(
EC.text_to_be_present_in_element((By.XPATH, "//b[@class='dxp-lead dxp-summary']"), f"Страница {current_page}")
)
time.sleep(2)
except Exception as e:
print(f"Error navigating to page {current_page}: {e}")
driver.save_screenshot(f"error_page_{current_page}.png")
time.sleep(5)
continue
else:
print(f"Navigating to page {start_page} from the end...")
try:
last_page_button = WebDriverWait(driver, 20).until(
EC.element_to_be_clickable((By.XPATH, f"//a[@onclick and text()='{total_pages}']"))
)
last_page_button.click()
current_page = total_pages
print(f"Moved to page {current_page}")
WebDriverWait(driver, 20).until(
EC.text_to_be_present_in_element((By.XPATH, "//b[@class='dxp-lead dxp-summary']"), f"Страница {current_page}")
)
time.sleep(2)
except Exception as e:
print(f"Error navigating to last page: {e}")
driver.save_screenshot("error_last_page.png")
return
while current_page > start_page:
try:
prev_button = WebDriverWait(driver, 20).until(
EC.element_to_be_clickable((By.XPATH, "//a[contains(@onclick,'PBP')]"))
)
prev_button.click()
current_page -= 1
print(f"Moved to page {current_page}")
WebDriverWait(driver, 20).until(
EC.text_to_be_present_in_element((By.XPATH, "//b[@class='dxp-lead dxp-summary']"), f"Страница {current_page}")
)
time.sleep(2)
except Exception as e:
print(f"Error navigating to page {current_page}: {e}")
driver.save_screenshot(f"error_page_{current_page}.png")
time.sleep(5)
continue
print(f"Arrived at the starting page {start_page}.")
def is_valid_row(row):
first_field = row.find("td").get_text(strip=True)
return bool(re.match(r"^DMd+$", first_field))
def is_small_table_with_buttons(table):
rows = table.find_all("tr")
if len(rows) < 5:
if table.find("div", class_="dxpLite_SoftOrange") or table.find("div", class_="dxgvPagerBottomPanel_SoftOrange"):
return True
return False
def scrape_table_data(start_page, end_page):
driver = None
all_data = []
max_columns = 0
try:
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.maximize_window()
driver.get("https://registru.dispozitive.amdm.gov.md/")
total_pages = get_total_pages(driver)
if total_pages is None:
print("Unable to determine total pages. Exiting.")
return
navigate_to_starting_page(driver, start_page, total_pages)
current_page = start_page
while current_page <= end_page:
print(f"Scraping page {current_page}...")
time.sleep(1)
soup = BeautifulSoup(driver.page_source, "html.parser")
if not is_big_table_navigation(soup):
print("Ignoring small table navigation.")
retry_count = 0
max_retries = 5
page_changed = False
while retry_count < max_retries and not page_changed:
try:
next_button = WebDriverWait(driver, 20).until(
EC.element_to_be_clickable((By.XPATH, "//a[contains(@onclick,'PBN')]"))
)
next_button.click()
current_page += 1
print(f"Moved to page {current_page}")
WebDriverWait(driver, 20).until(
EC.text_to_be_present_in_element((By.XPATH, "//b[@class='dxp-lead dxp-summary']"), f"Страница {current_page}")
)
page_changed = True
except Exception as e:
retry_count += 1
print(f"Error moving to next page (attempt {retry_count}/{max_retries}): {e}")
driver.save_screenshot(f"error_next_page_{current_page}.png")
time.sleep(5)
continue
if not page_changed:
print(f"Failed to move to next page after {max_retries} attempts. Exiting loop.")
break
continue
tables = soup.find_all("table", class_="dxgvTable_SoftOrange")
for table in tables:
rows = table.find_all("tr", id=lambda x: x and x.startswith("ctl00_ctl00_ASPxSplitter1_Content_ContentSplitter_MainContent_ASPxGridView4_DXDataRow"))
if len(rows) < 5 or not any(is_valid_row(row) for row in rows):
print("Skipping a small table.")
continue
for row in rows:
if is_valid_row(row):
columns = row.find_all("td")
data = [column.get_text(strip=True) for column in columns]
all_data.append(data)
if len(columns) > max_columns:
max_columns = len(columns)
if current_page == end_page:
print("Reached the end page.")
break
retry_count = 0
max_retries = 5
page_changed = False
while retry_count < max_retries and not page_changed:
try:
next_button = WebDriverWait(driver, 20).until(
EC.element_to_be_clickable((By.XPATH, "//a[contains(@onclick,'PBN')]"))
)
next_button.click()
current_page += 1
print(f"Moved to page {current_page}")
WebDriverWait(driver, 20).until(
EC.text_to_be_present_in_element((By.XPATH, "//b[@class='dxp-lead dxp-summary']"), f"Страница {current_page}")
)
page_changed = True
except Exception as e:
retry_count += 1
print(f"Error moving to next page (attempt {retry_count}/{max_retries}): {e}")
driver.save_screenshot(f"error_next_page_{current_page}.png")
time.sleep(5)
continue
if not page_changed:
print(f"Failed to move to next page after {max_retries} attempts. Exiting loop.")
break
except Exception as e:
print(f"An error occurred: {e}")
finally:
if driver:
driver.quit()
if all_data:
for i in range(len(all_data)):
if len(all_data[i]) < max_columns:
all_data[i].extend([None] * (max_columns - len(all_data[i])))
columns = [f"Column{i+1}" for i in range(max_columns)]
df = pd.DataFrame(all_data, columns=columns)
file_name = f"registru.dispozitive.page{start_page}-{end_page}.xlsx"
df.to_excel(file_name, index=False)
print(f"Data saved to {file_name}")
else:
print("No data scraped to save.")
def menu():
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.maximize_window()
driver.get("https://registru.dispozitive.amdm.gov.md/")
total_pages = get_total_pages(driver)
driver.quit()
if total_pages is None:
print("Unable to determine the total number of pages. Exiting.")
return
print(f"Total pages: {total_pages}")
start_page = 1
end_page = total_pages
while True:
print("n1. Scrap all pages")
print("2. Set starting page for Scrap")
print("3. Set ending page for Scrap")
print("4. Start Scraping")
print("5. Quit")
choice = input("Enter your choice (1-5): ")
if choice == "1":
start_page = 1
end_page = total_pages
print(f"Set to scrape all pages from {start_page} to {end_page}.")
elif choice == "2":
start_page = int(input(f"Enter the starting page (1-{total_pages}): "))
if start_page < 1 or start_page > total_pages:
print(f"Invalid starting page. Please enter a number between 1 and {total_pages}.")
else:
print(f"Starting page set to {start_page}.")
elif choice == "3":
end_page = int(input(f"Enter the ending page ({start_page}-{total_pages}): "))
if end_page < start_page or end_page > total_pages:
print(f"Invalid ending page. Please enter a number between {start_page} and {total_pages}.")
else:
print(f"Ending page set to {end_page}.")
elif choice == "4":
print(f"Scraping from page {start_page} to {end_page}.")
scrape_table_data(start_page, end_page)
elif choice == "5":
print("Quitting the program.")
break
else:
print("Invalid choice. Please try again.")
menu()
if __name__ == "__main__":
menu()