I have created a script to scrape a website for a school project.
In links.csv there is a overview of all the trips that are offered.”https://www.sawadee.nl/groepsrondreizen/midden-en-zuid-amerika/costa-rica/crh-vakantie-costa-rica-groepsreis/vertrekdata-en-prijzen/”
In this case I have two websites that are interacting. In cell 04-02 there is a departure code that gets added to the following link to supply the website with the number of travellers ”https://www.sawadee.nl/group_compositions/?departure_code=”.
How can I link this data and persue the following steps to complete the module.
Help is appreciated.
I
`<?php
//Require the PhpSpreadsheet library
require('autoload.php');
use PhpOfficePhpSpreadsheetSpreadsheet;
use PhpOfficePhpSpreadsheetWriterXlsx;
// Read the CSV file
$csvFile = 'links.csv';
$data = array();
if (($handle = fopen($csvFile, 'r')) !== false) {
while (($row = fgetcsv($handle)) !== false) {
$data[] = $row;
}
fclose($handle);
}
$arr = array();
// Display the array
foreach ($data as $datas => $key) {
foreach ($key as $value) {
if ($value == "") {
} else {
$arr[] = $value;
}
}
}
// Iterate over each link in the CSV
foreach ($arr as $url) {
sleep(20);
// Get the HTML content from the URL
$html = file_get_contents($url);
// Create a new DOMDocument object
$dom = new DOMDocument();
// Configure libxml to handle HTML errors
libxml_use_internal_errors(true);
// Load the HTML content
$dom->loadHTML($html);
// Clear libxml error buffer
libxml_clear_errors();
// Find the table element with class "responsive-table"
$tableElements = $dom->getElementsByTagName('table');
foreach ($tableElements as $table) {
if ($table->getAttribute('class') === 'responsive-table') {
// Create an array to store the table data
$tableData = array();
// Find all table rows
$rows = $table->getElementsByTagName('tr');
// Process the header row to extract column names
$headerRow = $rows->item(0);
$headerCells = $headerRow->getElementsByTagName('th');
$columns = array();
foreach ($headerCells as $headerCell) {
$columns[] = trim($headerCell->nodeValue);
}
// Iterate over each row starting from the second row
for ($i = 1; $i < $rows->length; $i++) {
$rowData = array();
$row = $rows->item($i);
// Find all table cells in the row
$cells = $row->getElementsByTagName('td');
// Iterate over each cell and store the data
foreach ($cells as $j => $cell) {
$type = $cell->getAttribute('class');
switch ($type) {
case 'cell-01':
$rowData[$columns[$j]] = trim(strip_tags($cell->nodeValue));
break;
case 'cell-02':
$rowData[$columns[$j]] = trim(strip_tags($cell->nodeValue));
break;
case 'cell-20':
$rowData[$columns[$j]] = trim(strip_tags($cell->nodeValue));
break;
case 'cell-03':
$rowData[$columns[$j]] = trim(strip_tags($cell->nodeValue));
break;
case 'cell-04-2':
$groupCompositionUrl = $cell->getElementsByTagName('a')->item(0)->getAttribute('href');
$groupComposition = file_get_contents($groupCompositionUrl);
// Create a new DOMDocument object for group composition
$groupDom = new DOMDocument();
$groupDom->loadHTML($groupComposition);
$groupDom->preserveWhiteSpace = false;
// Extract the number of men and women participants
$menCount = 0;
$womenCount = 0;
$menList = $groupDom->getElementsByTagName('ul')->item(0);
if ($menList) {
$menItems = $menList->getElementsByTagName('li');
foreach ($menItems as $item) {
$count = intval($item->nodeValue);
if ($count >= 0) {
$menCount += $count;
}
}
}
$womenList = $groupDom->getElementsByTagName('ul')->item(1);
if ($womenList) {
$womenItems = $womenList->getElementsByTagName('li');
foreach ($womenItems as $item) {
$count = intval($item->nodeValue);
if ($count >= 0) {
$womenCount += $count;
}
}
}
$rowData[$columns[$j]] = 'Men: ' . $menCount . ', Women: ' . $womenCount;
break;
case 'cell-04':
$rowData[$columns[$j]] = trim(strip_tags($cell->nodeValue));
break;
case 'cell-05':
$rowData[$columns[$j]] = trim(strip_tags($cell->nodeValue));
break;
case 'cell-06':
$rowData[$columns[$j]] = trim(strip_tags($cell->nodeValue));
break;
}
}
// Add the row data to the table data array
$tableData[] = $rowData;
}
// Create a new Spreadsheet object
$spreadsheet = new Spreadsheet();
$sheet = $spreadsheet->getActiveSheet();
// Set the header row
foreach ($columns as $key => $column) {
$columnLetter = chr(65 + $key);
$sheet->setCellValue($columnLetter . '1', $column);
}
// Set the data rows
foreach ($tableData as $rowKey => $row) {
$rowNumber = $rowKey + 2;
foreach ($row as $columnKey => $cellValue) {
$columnLetter = chr(65 + (int)$columnKey);
$sheet->setCellValue($columnLetter . $rowNumber, $cellValue);
}
}
// Generate a unique filename based on the link
$filename = 'scraped_data_' . md5($url) . '.xlsx';
// Save the spreadsheet to an Excel file
$writer = new Xlsx($spreadsheet);
$writer->save($filename);
echo 'Data has been saved to ' . $filename . '<br>';
break; // Stop processing further tables
}
}
}
?>
`