I’m trying to scrape movie details (name, genre, etc.) from Paytm Movies Coimbatore using PHP’s DOMDocument and XPath. While my code fetches the HTML successfully, it fails to extract text content from movie cards.
{"@context":"http://schema.org","@type":"Movie","name":"Good Bad Ugly","url":"https://paytm.com/movies","genre":"action","image":"https://assetscdn1.paytm.com/images/cinema/Goof-e0906bf0-115c-11f0-ac8c-93d19273dc5b.jpg","inLanguage":"Tamil","duration":"PT140M","datePublished":"2025-04-10","releasedEvent":{"@context":"http://schema.org","@type":"PublicationEvent","startDate":"2025-04-10","location":{"@type":"Country","name":"IN"}},"aggregateRating":{"@context":"http://schema.org","@type":"AggregateRating","ratingValue":9.3,"bestRating":10,"ratingCount":2031}}93%Good Bad UglyUA16+TamilBook Ticket
<?php
header('Content-Type: text/html; charset=utf-8');
$url = "https://paytm.com/movies/coimbatore";
// Initialize cURL with headers
$ch = curl_init();
curl_setopt_array($ch, [
CURLOPT_URL => $url,
CURLOPT_RETURNTRANSFER => true,
CURLOPT_FOLLOWLOCATION => true,
CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
CURLOPT_SSL_VERIFYPEER => false,
CURLOPT_TIMEOUT => 30,
CURLOPT_HTTPHEADER => [
'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language: en-US,en;q=0.5',
]
]);
$html = curl_exec($ch);
if (curl_errno($ch)) {
die('cURL error: ' . curl_error($ch));
}
curl_close($ch);
$dom = new DOMDocument();
libxml_use_internal_errors(true);
$dom->loadHTML('<?xml encoding="UTF-8">' . $html);
libxml_clear_errors();
$xpath = new DOMXPath($dom);
$movies = [];
// Updated selectors based on current Paytm structure
$movieCards = $xpath->query("//div[contains(@class, 'movie-card') or contains(@class, 'movie')]");
foreach ($movieCards as $card) {
// Check if the card is valid
echo '<pre>'; print_r($card->textContent); echo '</pre>';
}
?>
output data like
Array (
[0] => [
'title' => 'Kill bill',
'genre' => 'Action',
'url' => '/movies/kill bill'
]
)