JavaScript - Exclude children of the first span tag inside a div
I am building a simple web scraper. The website I am scraping is http://www.home.com/pro/c/oho,-ni. The scraper clicks on every link with the class name `pro-title`, then extracts data from the web page it enters (e.g. http://www.me.com/pro/home/marcelle-services).
// CasperJS scraper: collects the href of every `pro-title` link on the listing
// page, opens each href that looks like a URL, and echoes the text found in
// that page's `div.info-list-text` elements.
var casper = require('casper').create({
    logLevel: "verbose",
    debug: true
});

// Filled in by the first step, consumed by the second.
var links;

// Loose URL matcher used to skip hrefs that are not real links.
var expression = /[-a-za-z0-9@:%_\+.~#?&//=]{2,256}\.[a-z]{2,4}\b(\/[-a-za-z0-9@:%_\+.~#?&//=]*)?/gi;
var regex = new RegExp(expression);

casper.start('http://www.home.com/ro/c/oho,-tn');

casper.then(function getLinks() {
    // The callback passed to evaluate() runs inside the loaded page, so it may
    // only use DOM APIs and must return a serializable value.
    links = this.evaluate(function () {
        var anchors = document.getElementsByClassName('pro-title');
        return Array.prototype.map.call(anchors, function (anchor) {
            return anchor.getAttribute('href');
        });
    });
});

casper.then(function () {
    this.each(links, function (self, link) {
        // getAttribute() returns null when an element has no href; skip those
        // instead of throwing on null.match().
        if (link && link.match(regex)) {
            self.thenOpen(link, function () {
                // fetchText() concatenates the text of every element matching
                // the selector, which is why the output runs together.
                var location = this.fetchText('div.info-list-text');
                // Fetched but not printed yet; only `location` is logged.
                var contact = this.fetchText('span.pro-contact-text');
                var description = this.fetchText('div.profile-about div');
                this.echo(location);
            });
        }
    });
});

casper.run(function () {
    this.exit();
});
The above code produces this output:
professionals interior decorators contact: guilbeaulocation: 5007 wyoming ave.nowoah, mi 45786
I want to omit the `div.info-list-text span:first` selection so that the word "professionals" is not logged.
You might include jQuery to make your life easier when selecting elements. One solution is this:
// CasperJS scraper that uses jQuery (injected through `clientScripts`) to pick
// out individual pieces of each profile's `div.info-list-text` blocks instead
// of dumping the concatenated text of all of them.
var casper = require('casper').create({
    logLevel: "verbose",
    debug: true,
    // Local script file injected into every loaded page so `$` exists
    // inside evaluate() callbacks.
    clientScripts: ['jquery.js']
});

// Filled in by the first step, consumed by the second.
var links;

// Loose URL matcher used to skip hrefs that are not real links.
var expression = /[-a-za-z0-9@:%_\+.~#?&//=]{2,256}\.[a-z]{2,4}\b(\/[-a-za-z0-9@:%_\+.~#?&//=]*)?/gi;
var regex = new RegExp(expression);

casper.start('http://www.houzz.com/professionals/c/nashville,-tn');

casper.then(function getLinks() {
    // Runs in the page context; returns the href of every `pro-title` element.
    links = this.evaluate(function () {
        var anchors = document.getElementsByClassName('pro-title');
        return Array.prototype.map.call(anchors, function (anchor) {
            return anchor.getAttribute('href');
        });
    });
});

casper.then(function () {
    this.each(links, function (self, link) {
        // Skip elements without an href rather than throwing on null.match().
        if (link && link.match(regex)) {
            self.thenOpen(link, function () {
                // Manually extract the wanted pieces with jQuery selectors:
                // the second nested span of the first info block, followed by
                // the full text of the second and third info blocks.
                var txtYouWant = this.evaluate(function () {
                    var desiredText = $($("div.info-list-text").first().find("span span")[1]).text();
                    desiredText += $($("div.info-list-text")[1]).text();
                    desiredText += $($("div.info-list-text")[2]).text();
                    return desiredText;
                });
                this.echo(txtYouWant);
            });
        }
    });
});

// Without run() none of the queued steps are executed.
casper.run(function () {
    this.exit();
});
Edit:
Make sure you fix this part:
// `clientScripts` lists local script files that Casper injects into every page
// it loads; including jquery.js makes `$` available inside evaluate() callbacks.
// Option names are case-sensitive (`logLevel`, `clientScripts`).
var casper = require('casper').create({
    logLevel: "verbose",
    debug: true,
    clientScripts: ['jquery.js']
});
Comments
Post a Comment