Good morning,
I have a question about data scraping using PHP.
For a travel agency, I need to scrape two websites because my client wants information about the competition. Specifically, they need all the trips that are offered, the prices, and the attendance.
I also received a PHP script that I can use, but it is a bit too technical for me.
Is there someone who can help me adjust the PHP script?
Thanks in advance,
I tried to adjust the script myself, but unfortunately it didn't work.
<?php
namespace ScraperNRV;
use GuzzleHttp\Client;
use Psr\Http\Message\ResponseInterface;
/**
 * Scraper that collects trips, departure dates and prices from a travel site.
 *
 * Flow: fetchUris() walks the paginated search result and registers every
 * product URL, updateProductInfo() loads each product page together with its
 * price grid, and finally the result is exported as CSV and mailed.
 *
 * NOTE(review): the class is named Sawadee, but the base URI, the endpoint
 * paths, the XPath selectors and the Dutch row labels below all target
 * www.tui.nl. To scrape a different site, $baseUri, the *Url properties and
 * the selectors have to be adapted to that site's markup.
 *
 * Relies on members that are not declared in this class and are presumably
 * provided by the parent Scraper class: $products, generateCsv(), sendMail().
 */
class Sawadee extends Scraper
{
    /** @var string Root of the scraped site; all request paths are relative to it. */
    private $baseUri = 'https://www.tui.nl/';

    /** @var string First search-result page; it is requested without cookies. */
    private $iniUrl = 'reizen/?_smstate=1$5_28_65';

    /** @var string Search-result page prefix; the page number is appended. */
    private $searchPageUrl = 'reizen/?page=';

    /** @var string Endpoint that returns the initial price grid of a product. */
    private $initGridUrl = 'data/pricegrid/pricegridprices/?clearprices=1';

    /** @var string Endpoint that moves the price grid to earlier/later dates. */
    private $selectionUrl = 'data/pricegrid/selection/';

    /** @var string Endpoint that returns the receipt for one selected price. */
    private $receiptUrl = 'data/pricegrid/priceselect/';

    /** @var int Highest search-result page number that is requested. */
    private $maxPages = 30;

    /** @var Client|null Lazily created HTTP client, see getClient(). */
    private $client;

    /** @var Cookie Cookie jar that is refreshed after every response. */
    private $cookies;

    /** @var array<string, string> Dutch month name => two-digit month number. */
    private $months = [
        'januari' => '01',
        'februari' => '02',
        'maart' => '03',
        'april' => '04',
        'mei' => '05',
        'juni' => '06',
        'juli' => '07',
        'augustus' => '08',
        'september' => '09',
        'oktober' => '10',
        'november' => '11',
        'december' => '12',
    ];

    /**
     * @param Cookie $cookies Cookie jar used for all requests of this scraper.
     */
    public function __construct(Cookie $cookies)
    {
        $this->cookies = $cookies;
    }

    /**
     * Runs the complete scrape: collect product URLs, enrich every product,
     * then generate the CSV and send the mail.
     *
     * A failure on one product is printed and does not stop the others.
     */
    public function scrape()
    {
        $this->fetchUris();

        foreach ($this->products as $product) {
            try {
                $this->updateProductInfo($product['url']);
            } catch (\Throwable $e) {
                echo $e->getMessage() . "\n";
            }
        }

        $this->generateCsv();
        $this->sendMail();
    }

    /**
     * Loads one product page and all of its price-grid pages and stores the
     * result in $this->products[$url].
     *
     * The price grid is first walked towards earlier dates and then towards
     * later dates; a direction is finished when the server repeats its
     * previous answer or returns no 'pricegrid' markup.
     *
     * @param string $url Product URL, relative to the base URI.
     */
    public function updateProductInfo(string $url)
    {
        $startTime = microtime(true);
        echo "Get page $url - " . date("Y-m-d H:i:s") . "\n";

        $this->products[$url] = [
            'url' => $url,
            'airline' => "",
            'prices' => [],
        ];

        $response = $this->getClient()->request("GET", $url, $this->getOptions(true));
        $this->cookies->update($response);
        if (!$this->hasOkStatus($response, "page $url")) {
            return;
        }

        $xpath = $this->createXPath($response->getBody()->getContents());

        // The <html data-tmd="theme_type_id"> attribute identifies the product
        // for all price-grid calls.
        $root = $xpath->query("//html")->item(0);
        $entityId = $root instanceof \DOMElement ? $root->getAttribute('data-tmd') : '';
        $idParts = explode("_", $entityId);
        if (count($idParts) < 3) {
            echo "Page $url has no usable data-tmd entity id\n";
            return;
        }
        list($theme, $type, $id) = $idParts;

        $heading = $xpath->query("//h1")->item(0);
        $this->products[$url]['name'] = $heading !== null ? trim($heading->nodeValue) : '';

        $crumb = $xpath->query("//ul[contains(@class,'crumbtrail')]/li")->item(0);
        $this->products[$url]['country'] = $crumb !== null ? trim($crumb->nodeValue) : '';

        $response = $this->getClient()->request("POST", $this->initGridUrl, $this->getOptions(true, true, [
            'masterentitytype' => $type,
            'masterentityid' => $id,
            'theme' => $theme,
            'firstview' => 'true',
        ]));
        $this->cookies->update($response);
        if (!$this->hasOkStatus($response, "page $url")) {
            return;
        }
        $this->parsePriceInfo($url, $entityId, $response->getBody()->getContents());

        $direction = 'earlier';
        $previousMessage = '';
        while (true) {
            echo "Price page $url -> $direction\n";

            try {
                $response = $this->getClient()->request("POST", $this->selectionUrl, $this->getOptions(true, true, [
                    'MoveRelativeDates' => $direction,
                    'Entity' => $entityId,
                ]));
            } catch (\Throwable $e) {
                echo $e->getMessage() . "\n";
                break;
            }

            $this->cookies->update($response);
            if (!$this->hasOkStatus($response, "page $url")) {
                break;
            }

            $content = $response->getBody()->getContents();
            $grid = null;
            if ($content !== $previousMessage) {
                $previousMessage = $content;
                $json = json_decode($content, true);
                $candidate = is_array($json) ? ($json['pricegrid'] ?? null) : null;
                if (is_string($candidate) && $candidate !== '' && $candidate !== '0') {
                    $grid = $candidate;
                }
            }

            if ($grid === null) {
                // Nothing new in this direction: turn around once, then stop.
                if ($direction === 'earlier') {
                    $direction = 'later';
                    continue;
                }
                break;
            }

            $this->parsePriceInfo($url, $entityId, $grid);
            echo "Price page $url -> $direction done\n";
        }

        echo "UpdateProductInfo - " . (microtime(true) - $startTime) . "\n";
    }

    /**
     * Collects the product URLs from the first search page and from the
     * following pages up to and including page $maxPages.
     */
    private function fetchUris()
    {
        $response = $this->getClient()->request("GET", $this->iniUrl, []);
        $this->cookies->update($response);
        $this->parseSearchPage($response);

        // Page 1 is the initial URL above, so continue with 2..$maxPages.
        for ($page = 2; $page <= $this->maxPages; $page++) {
            try {
                $response = $this->getClient()->request(
                    "GET",
                    $this->searchPageUrl . $page,
                    $this->getOptions(true)
                );
                if (!$this->hasOkStatus($response, "page $page")) {
                    break;
                }
                $this->cookies->update($response);
                $this->parseSearchPage($response);

                $wait = rand(1, 5);
                echo "Parsed page $page of {$this->maxPages}, waiting $wait sec.\n";
                // Throttling is switched off; enable the next line to pause between pages.
                // sleep($wait);
            } catch (\Throwable $e) {
                echo $e->getMessage() . "\n" . $e->getTraceAsString() . "\n";
            }
        }
    }

    /**
     * Reads every price cell from a price-grid fragment and keeps, per
     * departure date, the cheapest offer in $this->products[$url]['prices'].
     *
     * The receipt (flights, passengers) is only requested for an offer that
     * is the first one or cheaper than the one already stored for its date.
     *
     * @param string $url      Product URL the prices belong to.
     * @param string $entityId Value of the product's data-tmd attribute.
     * @param string $content  HTML of the price grid.
     */
    private function parsePriceInfo(string $url, string $entityId, string $content)
    {
        $startTime = microtime(true);
        $xpath = $this->createXPath($content);

        foreach ($xpath->query("//*[boolean(@data-tui-tooltip-element)]/..") as $cell) {
            $priceNode = $cell->getElementsByTagName('span')->item(0);
            if ($priceNode === null) {
                continue;
            }
            $priceId = $priceNode->getAttribute('rev');
            $price = $priceNode->nodeValue;

            $row = $this->readTooltipRows($cell);
            $date = $row['date'];

            $known = $this->products[$url]['prices'][$date] ?? null;
            $isCheaper = $known !== null
                && $this->priceToFloat((string) $price) < $this->priceToFloat((string) $known['price']);
            if ($known !== null && empty($known['first']) && !$isCheaper) {
                continue;
            }

            $info = $this->getReceipt($entityId, $priceId);
            $this->products[$url]['prices'][$date] = [
                'first' => false,
                'date' => $date,
                'occupation' => $row['occupation'],
                'duration' => $row['duration'],
                'board' => $row['board'],
                'transport' => $row['transport'],
                'departure' => $row['departure'],
                'price' => $price,
                'outbound' => $info['outbound'] ?? '',
                'inbound' => $info['inbound'] ?? '',
                'pax' => $info['pax'] ?? '',
            ];
        }

        echo "parsePriceInfo - " . (microtime(true) - $startTime) . "\n";
    }

    /**
     * Extracts the labelled rows (label cell, value cell) of one price cell.
     *
     * @param \DOMNode $cell Node that contains the tooltip table.
     *
     * @return array Keys: date, occupation, duration, board, transport, departure.
     */
    private function readTooltipRows(\DOMNode $cell): array
    {
        $row = [
            'date' => "",
            'occupation' => "",
            'duration' => "",
            'board' => "",
            'transport' => "",
            'departure' => "",
        ];

        foreach ($cell->getElementsByTagName('tr') as $line) {
            $fields = $line->getElementsByTagName('td');
            if ($fields->length < 2) {
                continue;
            }
            $name = $fields->item(0)->nodeValue;
            $value = $fields->item(1)->nodeValue;

            if (stripos($name, 'bezet') !== false) {
                // Removes the word and then every remaining "n"; presumably
                // that clears the plural ending left over from "volwassenen".
                $row['occupation'] = trim(str_ireplace('n', '', str_ireplace('volwassene', '', $value)));
            } elseif (stripos($name, 'duur') !== false) {
                // The label counts days ("dgn"); one less is stored.
                $row['duration'] = (int) trim(str_ireplace('dgn', '', $value)) - 1;
            } elseif (stripos($name, 'verzorging') !== false) {
                $row['board'] = $value;
            } elseif (stripos($name, 'vervoer') !== false) {
                $row['transport'] = $value;
            } elseif (stripos($name, 'datum') !== false) {
                $row['date'] = $this->parseDate($value);
            } elseif (stripos($name, 'vanaf') !== false) {
                $row['departure'] = $value;
            }
        }

        return $row;
    }

    /**
     * Requests the receipt of one price and extracts the outbound and inbound
     * journey descriptions plus the number of passengers.
     *
     * @param string $entityId Value of the product's data-tmd attribute.
     * @param string $priceId  Price selection id taken from the price cell.
     *
     * @return array Keys outbound, inbound and pax; empty when the receipt
     *               could not be retrieved.
     */
    private function getReceipt(string $entityId, string $priceId): array
    {
        $startTime = microtime(true);

        if (empty($priceId)) {
            echo "Empty price id given\n";
            return ['outbound' => "", 'inbound' => "", 'pax' => ""];
        }

        $response = $this->getClient()->request("POST", $this->receiptUrl, $this->getOptions(true, true, [
            'PriceSelectionId' => $priceId,
            'Entity' => $entityId,
        ]));
        $this->cookies->update($response);
        if (!$this->hasOkStatus($response, "receipt for price id $priceId")) {
            return [];
        }

        $json = json_decode($response->getBody()->getContents(), true);
        $markup = is_array($json) ? ($json['pricegrid'] ?? null) : null;
        if (!is_string($markup) || $markup === '' || $markup === '0') {
            return [];
        }

        $xpath = $this->createXPath($markup);
        $outbound = [];
        $inbound = [];

        // Each class is only used when it occurs exactly twice:
        // first match = outbound journey, second match = inbound journey.
        foreach (['dep-loc', 'arr-loc', 'trnsprt'] as $classname) {
            $nodes = $xpath->query("//*[contains(@class, '$classname')]");
            if ($nodes->length == 2) {
                $outbound[] = $nodes->item(0)->nodeValue;
                $inbound[] = $nodes->item(1)->nodeValue;
            }
        }

        $pax = "";
        $nodes = $xpath->query("//*[contains(@class, 'grp-cmpstn-cnt')]/@data-totalpassengers");
        if ($nodes->length == 1) {
            $pax = $nodes->item(0)->nodeValue;
        }

        echo "getReceipt - " . (microtime(true) - $startTime) . "\n";

        return ['outbound' => implode("- ", $outbound), 'inbound' => implode("- ", $inbound), 'pax' => $pax];
    }

    /**
     * Registers every product link found on a search-result page in
     * $this->products, keyed by its href.
     *
     * @param ResponseInterface $response Response of a search-result request.
     */
    private function parseSearchPage(ResponseInterface $response)
    {
        $xpath = $this->createXPath($response->getBody()->getContents());

        foreach ($xpath->query("//div[contains(@class,'pricelabel')]/a") as $link) {
            $href = $link->getAttribute('href');
            if ($href === '') {
                continue;
            }
            $this->products[$href] = [
                'url' => $href,
                'prices' => [],
            ];
        }
    }

    /**
     * Returns the shared HTTP client, creating it on first use.
     *
     * The default headers are merged into the client configuration so that
     * requests made without explicit options carry them as well.
     */
    private function getClient(): Client
    {
        if ($this->client === null) {
            $this->client = new Client(['base_uri' => $this->baseUri] + $this->getOptions());
        }

        return $this->client;
    }

    /**
     * Builds the Guzzle request options.
     *
     * @param bool  $addCookie Send the current cookies.
     * @param bool  $ajax      Add the headers that mark an AJAX request.
     * @param array $params    Form fields for a POST body; omitted when empty.
     *
     * @return array Options for Client::request().
     */
    private function getOptions(bool $addCookie = false, bool $ajax = false, array $params = []): array
    {
        $headers = [
            'User-Agent' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15',
            // Derived from the base URI so both always name the same site.
            'Host' => (string) parse_url($this->baseUri, PHP_URL_HOST),
            'Accept-Encoding' => 'gzip, deflate, br',
            'Connection' => 'keep-alive',
        ];

        if ($addCookie) {
            $headers['Cookie'] = $this->cookies->toString();
        }
        if ($ajax) {
            $headers['X-TS-AJAX-Request'] = "true";
            $headers['X-Requested-With'] = "XMLHttpRequest";
        }

        $options = [
            'headers' => $headers,
            'decode_content' => true,
        ];
        if (!empty($params)) {
            $options['form_params'] = $params;
        }

        return $options;
    }

    /**
     * Prints a message when a response is not a plain 200.
     *
     * @param ResponseInterface $response Response to inspect.
     * @param string            $subject  What was requested, used in the message.
     *
     * @return bool True when the status code is 200.
     */
    private function hasOkStatus(ResponseInterface $response, string $subject): bool
    {
        if ($response->getStatusCode() === 200) {
            return true;
        }

        echo "Parsed $subject gives http code " . $response->getStatusCode() . "\n";

        return false;
    }

    /**
     * Parses (possibly malformed) HTML and returns an XPath object for it.
     *
     * Parser warnings are collected by libxml instead of being printed; empty
     * input yields an empty document, so queries simply return no nodes.
     *
     * NOTE(review): loadHTML() falls back to ISO-8859-1 when the markup does
     * not declare a charset; confirm that non-ASCII text (e.g. a euro sign in
     * the JSON fragments) arrives intact.
     *
     * @param string $html Markup to parse.
     */
    private function createXPath(string $html): \DOMXPath
    {
        $dom = new \DOMDocument();

        if (trim($html) !== '') {
            $previous = libxml_use_internal_errors(true);
            $dom->loadHTML($html);
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }

        return new \DOMXPath($dom);
    }

    /**
     * Converts a displayed price to a number so prices can be compared.
     *
     * Assumes Dutch notation ("." groups thousands, "," is the decimal sign),
     * e.g. "€ 1.299,-" becomes 1299.0 — TODO confirm against the live markup.
     *
     * @param string $price Price text as shown on the site.
     */
    private function priceToFloat(string $price): float
    {
        $digits = (string) preg_replace('/[^0-9.,]/', '', $price);
        $digits = str_replace(',', '.', str_replace('.', '', $digits));

        return (float) $digits;
    }

    /**
     * Converts a Dutch date such as "za 12 juni 2021" to "2021-06-12".
     *
     * The month name is located first; the token before it is the day and
     * the token after it the year. Text without a recognisable month is
     * returned trimmed but otherwise unchanged.
     *
     * @param string $date Date text taken from the price tooltip.
     */
    private function parseDate(string $date): string
    {
        $date = trim($date);
        $parts = preg_split('/\s+/', $date, -1, PREG_SPLIT_NO_EMPTY);
        if ($parts === false) {
            return $date;
        }

        foreach ($parts as $index => $token) {
            $month = $this->months[strtolower($token)] ?? null;
            if ($month !== null && isset($parts[$index - 1], $parts[$index + 1])) {
                return sprintf("%s-%s-%02d", $parts[$index + 1], $month, (int) $parts[$index - 1]);
            }
        }

        return $date;
    }
}