I am trying to learn web scraping through Chrome extensions. I scrape some hard-to-scrape things and have built a few custom scraping extensions. This is the first one where I am trying to loop, automatically opening (and, hopefully, closing) tabs on each iteration.
The problem seems to be related to inactive (active = false) tabs. When I open the tabs as active (active = true), I presumably get all of the page's AJAX-loaded content; however, I don't get the full data when I open them as inactive. It is as if the AJAX loading doesn't occur.
I believe my code fires when the page is fully loaded.
I normally scrape these pages by simply loading the page in an active tab and sending a message from the content script to a textarea, using the same code as in payload.js. Then I finish parsing in PHP ;(
I used this answer in my solution: "Open tab, wait for it to finish loading then close it".
Does anyone know why I get different results using the same content script (payload.js) for active and inactive windows?
{
"manifest_version": 2,
"name": "Loop",
"description": "A simple Looping page-scraping extension.",
"version": "1.0",
"author": "willrich33",
"background": {
"scripts": ["popup.js"],
"persistent": true
},
"permissions": [
"tabs",
"http://*/",
"https://*/",
"storage"
],
"content_scripts": [{
"matches": ["<all_urls>"],
"all_frames": true,
"js": ["jquery-3.6.0.min.js"],
"run_at":"document_start"
}],
"browser_action": {
"default_icon": "logo.png",
"default_popup": "popup.html"
}
}
popup.js
// Wire up the popup's "Open Page" button once the DOM is ready.
document.addEventListener('DOMContentLoaded', () => {
  const openButton = document.getElementById('myOpen');
  const pageArea = document.getElementById('pagearea');

  // The manifest also loads this script as the background page, which has
  // none of the popup's elements; bail out instead of throwing on null.
  if (!openButton || !pageArea) {
    return;
  }

  openButton.addEventListener('click', () => {
    // One URL per line: split on line breaks (not on the letter "n", which
    // would chop URLs apart), trim each entry and drop blank lines.
    const urls = pageArea.value
      .split(/\r?\n/)
      .map((line) => line.trim())
      .filter((line) => line.length > 0);

    pageArea.value = '';

    // create_tab is async; surface any rejection instead of leaving the
    // promise floating.
    create_tab(urls).catch((err) => {
      console.error('Opening tabs failed:', err);
    });
  });
});
// Promise-based delay: the returned promise resolves (with no value) once
// `ms` milliseconds have elapsed, so callers can `await timer(ms)`.
const timer = (ms) =>
  new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
/**
 * Opens each URL in its own inactive tab, one at a time, and injects
 * payload.js into the tab once Chrome reports its load status as 'complete'.
 *
 * The loop lives in an async function so the pause between tabs can be
 * awaited.
 *
 * NOTE(review): 'complete' is the tab's load status; content that page
 * scripts fetch afterwards may not be in the DOM yet when payload.js runs —
 * confirm against the target pages.
 *
 * @param {string[]} urls - Page URLs to open; blank entries are skipped.
 * @param {number} [delayMs=10000] - Pause after opening each tab before
 *   moving on to the next URL.
 * @returns {Promise<void>} Resolves after the pause that follows the last URL.
 */
async function create_tab(urls, delayMs = 10000) {
  for (const url of urls) {
    // A blank line in the input would otherwise open a tab with no URL.
    if (!url) {
      continue;
    }

    // `active` replaces the deprecated `selected` property.
    chrome.tabs.create({ url, active: false }, (myTab) => {
      if (chrome.runtime.lastError || !myTab) {
        console.warn('Could not open tab for', url, chrome.runtime.lastError);
        return;
      }

      const listener = (tabId, changeInfo) => {
        // Only react to the 'complete' status of the tab opened above.
        if (tabId !== myTab.id || changeInfo.status !== 'complete') {
          return;
        }
        // Unregister first so payload.js is injected at most once per tab.
        chrome.tabs.onUpdated.removeListener(listener);
        chrome.tabs.executeScript(tabId, { file: 'payload.js' }, () => {
          if (chrome.runtime.lastError) {
            console.warn('Could not inject payload.js into', url, chrome.runtime.lastError);
          }
        });
      };

      chrome.tabs.onUpdated.addListener(listener);
    });

    await timer(delayMs);
  }
}
// Listen for messages sent by payload.js and show the received text in the
// textarea of popup.html.
chrome.runtime.onMessage.addListener((message) => {
  const pageArea = document.getElementById('pagearea');

  // The manifest also loads this script as the background page, where
  // #pagearea does not exist; ignore the message there instead of throwing.
  if (!pageArea) {
    return;
  }

  pageArea.value = message;
});
payload.js
// Injected into a scraped tab: send the page's full <body> markup to the
// extension as a runtime message. The block scope keeps the local out of the
// script's top-level scope.
{
  const bodyHtml = document.body.outerHTML;
  chrome.runtime.sendMessage(bodyHtml);
}
popup.html
<!doctype html>
<html>
<head>
<title>Loop</title>
<!-- popup.js waits for DOMContentLoaded before looking up the elements below. -->
<script src="popup.js"></script>
</head>
<body>
<h3>Loop</h3>
<p id='pagetitle'></p>
<!-- Input for the page list; popup.js also writes received page HTML back into this box. -->
<textarea id='pagearea' cols="40" rows="6"></textarea><br>
<button id="myOpen">Open Page</button>
</body>
</html>