I wrote a web spider, and I would like it to download and parse pages concurrently. Here is what I am trying to achieve:
- instantiate a new instance of the class with $startURL passed to the constructor
- spider $startURL with the public concurrentSpider() function
- for each of the links found at that URL, fork PHP, and in the child instantiate a new object for each of those links and spider it as well, concurrently
If $startURL has 3 links, for example, I'm expecting 3 processes to run simultaneously to retrieve the links from those pages. If each of those pages has 3 links, then I'm expecting 9 to run simultaneously. In other words, the fan-out I'm going for looks roughly like the sketch below.
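Here is a minimal, stripped-down sketch of that fork-per-link pattern (the hard-coded links are placeholders; this is just to illustrate the shape, not my actual code):

    <?php
    // Fork one child per link; each child would spider its link
    // (and fork again for the links it finds), then exit.
    $links = array('http://site.com/link1.html',
                   'http://site.com/link2.html',
                   'http://site.com/link3.html');

    foreach ($links as $link) {
        $pid = pcntl_fork();
        if ($pid == -1) {
            die("Could not fork!\n");
        } elseif ($pid == 0) {
            // Child process: do the work for this link only.
            print "Child " . getmypid() . " spidering $link\n";
            exit(0);
        }
        // Parent: fall through and fork the next child immediately.
    }

    // Parent reaps all children once the loop is done.
    while (pcntl_wait($status) > 0);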
Here is the code. There is only one public function, and the foreach in the public concurrentSpider($url) function is where I am having problems, but I have included the entire class to be thorough.
class ConcurrentSpider {

    private $startURL;

    const DELAY = 1;
    const SLEEPTIME = 1;
    const ALLOW_OFFSITE = FALSE;

    private $maxChildren = 1;
    private $children = array();

    function __construct($url) {
        // This is important if ALLOW_OFFSITE is FALSE,
        // so that we have something to compare the URL to.
        $this->startURL = $url;
    }
    public function concurrentSpider($url) {
        // STEP 1:
        // Download the $url.
        $pageData = http_get($url, $ref = '');
        if (!$this->checkIfSaved($url)) {
            $this->save_link_to_db($url, $pageData);
        }
        //print_r($pageData);
        sleep(self::SLEEPTIME);

        // STEP 2:
        // Extract all links from this url's page data.
        $linksOnThisPage = $this->harvest_links($url, $pageData);

        // STEP 3:
        // Check the links array from STEP 2 to see if the pages have
        // already been saved or are excluded because of any other
        // logic from the exclude_link() function.
        $filteredLinks = $this->filterLinks($linksOnThisPage);
        //print_r($filteredLinks);

        // STEP 4: Loop through each of the links and
        // repeat the process.
        foreach ($filteredLinks as $filteredLink) {
            $pid = pcntl_fork();
            switch ($pid) {
                case -1:
                    print "Could not fork!\n";
                    exit(1);
                case 0:
                    print "In child with PID: " . getmypid() . " processing $filteredLink \n";
                    //$this->concurrentSpider($filteredLink);
                    // The above didn't work. Let's try creating a new object
                    // in a child-unique variable based on $pid.
                    $var[$pid] = new ConcurrentSpider($this->startURL);
                    $var[$pid]->concurrentSpider($filteredLink);
                    sleep(2);
                    exit(1);
                default:
                    // print "$pid In the parent\n";
                    // Add an element to the children array.
                    $this->children[$pid] = $pid;
                    // If the maximum number of children has been
                    // reached, wait until one or more return
                    // before continuing.
                    while (count($this->children) >= $this->maxChildren) {
                        $pid = pcntl_waitpid(0, $status);
                        unset($this->children[$pid]);
                    }
            }
        }
    }
    /**
     * Extract URLs from a web page.
     *
     * @param string $url
     * @param array $pageData
     * @return array $links on success
     */
    private function harvest_links($url, $pageData) {
        $link_array = array();
        // Get the page base for $url.
        $page_base = ResolveAddresses::get_base_page_address($url);
        $anchor_tags = parse_array($pageData['FILE'], '<a', '</a>', EXCL);
        // Resolve the href attribute of each anchor tag and collect it.
        for ($xx = 0; $xx < count($anchor_tags); $xx++) {
            $href = get_attribute($anchor_tags[$xx], "href");
            $resolved_address = ResolveAddresses::resolve_address($href, $page_base);
            $link_array[] = $resolved_address;
        }
        return $link_array;
    }
    /**
     * Take an array of links and filter out
     * the ones that are not needed based on the
     * logic of the exclude_link() function.
     *
     * @param array $links
     * @return array
     */
    private function filterLinks(array $links) {
        $filteredLinks = array();
        foreach ($links as $link) {
            if (!$this->exclude_link($link, $filteredLinks)) {
                $filteredLinks[] = $link;
            }
        }
        print_r($filteredLinks);
        return $filteredLinks;
    }
    private function exclude_link($link, array $currentArray) {
        // TODO: have this read from a file.
        $exclusion_array = array();
        $exclude = FALSE;
        // Exclude links already collected for this page.
        if (in_array($link, $currentArray)) {
            $exclude = true;
        }
        // Exclude links that are JavaScript commands.
        if (stristr($link, "javascript")) {
            echo "Ignored JavaScript function: $link\n";
            $exclude = true;
        }
        // Exclude links that contain #.
        if (stristr($link, "#")) {
            echo "Ignored # in $link\n";
            $exclude = true;
        }
        // Exclude links found in $exclusion_array.
        for ($xx = 0; $xx < count($exclusion_array); $xx++) {
            if (stristr($link, $exclusion_array[$xx])) {
                echo "Ignored excluded link: $link\n";
                $exclude = true;
            }
        }
        // Exclude offsite links if requested.
        if (self::ALLOW_OFFSITE === FALSE) {
            if ($this->get_domain($link) != $this->get_domain($this->startURL)) {
                //print $this->get_domain($link) . " " . $this->get_domain($this->startURL) . "\n";
                echo "Ignored offsite link: $link\n";
                $exclude = true;
            }
        }
        if ($exclude === FALSE) {
            // print "Added new link: $link \n";
        }
        return $exclude;
    }
    /**
     * Compare against $startURL to make sure we are on the same domain name.
     *
     * @param string $url
     * @return string
     */
    private function get_domain($url) {
        // Remove the protocol from $url.
        $url = str_replace("http://", "", $url);
        $url = str_replace("https://", "", $url);
        // Remove page and directory references.
        if (stristr($url, "/")) {
            $url = substr($url, 0, strpos($url, "/"));
        }
        return $url;
    }
    private function checkIfSaved($url) {
        // Exclude redundant links:
        // check the database to see if the link was already saved.
        // ($conn is assumed to be a PDO connection created elsewhere.)
        $sql = "select count(id) from url_data where url = :url";
        $result = $conn->prepare($sql);
        $result->execute(array(':url' => $url));
        $number_of_rows = $result->fetchColumn();
        if ($number_of_rows > 0) {
            //print "Link: $url already exists in database\n";
            return true;
        }
        return FALSE;
    }
    private function save_link_to_db($link, array $downloaded_data) {
        // ($dbh is assumed to be a PDO connection created elsewhere.)
        $sql = "insert into url_data values('', :raw_html, :stripped_html, :status, :error, :URL, 'N')";
        $ps = $dbh->prepare($sql);
        // TODO: test for success of the PDO execute statement.
        $ps->execute(array(
            ':raw_html' => preg_replace('/\s+/', ' ', strip_tags($downloaded_data['FILE'])),
            // Serialize the entire array if we want all status data. For now, just
            // the HTTP return code is fine.
            //':status' => serialize($downloaded_data['STATUS']),
            ':status' => $downloaded_data['STATUS']['http_code'],
            ':error' => $downloaded_data['ERROR'],
            ':stripped_html' => preg_replace('/\s+/', ' ', strip_tags($downloaded_data['FILE'])),
            ':URL' => $link,
        ));
    }
}
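For completeness, the class is driven roughly like this (the seed URL doubles as the first page to spider):

    $startURL = 'http://site.com/';
    $spider = new ConcurrentSpider($startURL);
    $spider->concurrentSpider($startURL);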
Note the print statement in the child. This is a sample of its output. $filteredLink is always the same link, which is element 0 of the $filteredLinks array in the parent.
    In child with PID: 4333 processing http://site.com/link.html
    In child with PID: 4334 processing http://site.com/link.html
    In child with PID: 4335 processing http://site.com/link.html
This seems to be an infinite loop.
In the child, however, if I comment out the instantiation and use of the object, like this:
    //$var[$pid] = new ConcurrentSpider($this->startURL);
    //$var[$pid]->concurrentSpider($filteredLink);
then the print statement prints correctly, and $filteredLink is link1.html, link2.html, link3.html, etc.
What about my logic of trying to instantiate and use a new object in the child is causing it to loop indefinitely?
1 Answer
It looks like the first link on every page of the site is the same (e.g. the link on a logo), and the child processes are simply printing that link correctly, so the behavior only looks wrong but isn't.
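A quick way to confirm this is to log the full filtered list for each page, not just the link a child was forked for. A minimal addition inside concurrentSpider(), right after the filterLinks() call, tagged with the PID so sibling children can be told apart in the interleaved output:

    // Show everything this process is about to fork for, not just element 0.
    print getmypid() . " filtered links for $url: " . print_r($filteredLinks, true);

If the same URL shows up at position 0 of every page's list, the children are doing exactly what they were told. If you also want to stop re-spidering that repeated link, skip any link that checkIfSaved() already reports as stored before forking for it.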