javascript - Exclude children of the first span tag inside div -


I am building a simple web scraper. The website I am scraping is http://www.home.com/pro/c/oho,-ni. The scraper clicks on every link with the class name `pro-title` and extracts data from the web page it enters (e.g. http://www.me.com/pro/home/marcelle-services).

// CasperJS script: collects the href of every `.pro-title` element on the
// start page, opens each href that looks like a URL, and prints the text of
// the `div.info-list-text` elements found on the opened page.
//
// NOTE(review): CasperJS documents `verbose` (boolean) and `logLevel`
// ("debug", "info", "warning", "error") as its logging options;
// `logLevel: "verbose"` and `debug: true` look swapped/unsupported. Values are
// left as posted - confirm against the CasperJS options reference.
var casper = require('casper').create({
    logLevel: "verbose",
    debug: true
});

var jsonObj = {};
var links;
var name;
var paragraph;
var contact;
var description;
var location;
// Loose URL matcher used to skip hrefs that are not real links.
var expression = /[-a-zA-Z0-9@:%_\+.~#?&//=]{2,256}\.[a-z]{2,4}\b(\/[-a-zA-Z0-9@:%_\+.~#?&//=]*)?/gi;
var regex = new RegExp(expression);

casper.start('http://www.home.com/ro/c/oho,-tn');

// Step 1: gather the href attribute of every profile link on the page.
casper.then(function getLinks() {
    links = this.evaluate(function () {
        var links = document.getElementsByClassName('pro-title');
        // getElementsByClassName returns an HTMLCollection, not an Array,
        // so borrow Array.prototype.map to convert it to a list of hrefs.
        links = Array.prototype.map.call(links, function (link) {
            return link.getAttribute('href');
        });
        return links;
    });
});

// Step 2: visit each collected link and print the scraped text.
casper.then(function () {
    this.each(links, function (self, link) {
        if (link.match(regex)) {
            self.thenOpen(link, function (a) {
                // fetchText concatenates the text of ALL matching elements,
                // which is why the first span's text also shows up in the output.
                var location = this.fetchText('div.info-list-text');
                //var location = document.querySelectorAll("div.info-list-text")[1];
                var contact = this.fetchText('span.pro-contact-text');
                var description = this.fetchText('div.profile-about div');
                this.echo(location);
                //this.echo(contact);
                //this.echo(description);
            });
        }
    });
});

// Start the queued steps and quit once they have all finished.
casper.run(function () {
    this.exit();
});

The above code produces the following output:

                                       professionals                                  interior decorators                  contact: guilbeaulocation: 5007 wyoming ave.nowoah, mi 45786 

I want to omit the `div.info-list-text span:first` selection so that the word "professionals" is not logged.

You might include jQuery to make life easier when selecting elements. One solution is this:

// CasperJS script: collects the href of every `.pro-title` element on the
// start page, opens each href that looks like a URL, and uses jQuery inside
// the page to pick only the wanted pieces of `div.info-list-text`.
//
// `clientScripts` injects the local jquery.js file into every loaded page so
// that `$` is available inside evaluate().
//
// NOTE(review): CasperJS documents `verbose` (boolean) and `logLevel`
// ("debug", "info", "warning", "error") as its logging options;
// `logLevel: "verbose"` and `debug: true` look swapped/unsupported. Values are
// left as posted - confirm against the CasperJS options reference.
var casper = require('casper').create({
    logLevel: "verbose",
    debug: true,
    clientScripts: ['jquery.js']
});

var jsonObj = {};
var links;
var name;
var paragraph;
var contact;
var description;
var location;
// Loose URL matcher used to skip hrefs that are not real links.
var expression = /[-a-zA-Z0-9@:%_\+.~#?&//=]{2,256}\.[a-z]{2,4}\b(\/[-a-zA-Z0-9@:%_\+.~#?&//=]*)?/gi;
var regex = new RegExp(expression);

casper.start('http://www.houzz.com/professionals/c/nashville,-tn');

// Step 1: gather the href attribute of every profile link on the page.
casper.then(function getLinks() {
    links = this.evaluate(function () {
        var links = document.getElementsByClassName('pro-title');
        // getElementsByClassName returns an HTMLCollection, not an Array,
        // so borrow Array.prototype.map to convert it to a list of hrefs.
        links = Array.prototype.map.call(links, function (link) {
            return link.getAttribute('href');
        });
        return links;
    });
});

// Step 2: visit each collected link and extract the text with jQuery.
casper.then(function () {
    this.each(links, function (self, link) {
        if (link.match(regex)) {
            self.thenOpen(link, function (a) {
                // Manually extract the wanted pieces with jQuery selectors.
                var txtYouWant = casper.evaluate(function () {
                    // From the first info block take only the second nested
                    // span, which leaves out the first span's text.
                    var desiredText = $($("div.info-list-text").first().find("span span")[1]).text();
                    desiredText += $($("div.info-list-text")[1]).text();
                    desiredText += $($("div.info-list-text")[2]).text();
                    // The posted version returned the undeclared `desiredtxt`,
                    // which throws a ReferenceError inside the page.
                    return desiredText;
                });
                // Print the result; it was previously computed and discarded.
                this.echo(txtYouWant);
            });
        }
    });
});

// Without run() the queued steps never execute.
casper.run(function () {
    this.exit();
});

Edit:

Make sure you fix this part so that jQuery is injected into the page:

// Create the Casper instance. `clientScripts` injects the local jquery.js
// file into every loaded page so that `$` is available inside evaluate().
// Option names are case-sensitive: a lower-cased `clientscripts` is ignored.
//
// NOTE(review): CasperJS documents `verbose` (boolean) and `logLevel`
// ("debug", "info", "warning", "error") as its logging options;
// `logLevel: "verbose"` and `debug: true` look swapped/unsupported. Values are
// left as posted - confirm against the CasperJS options reference.
var casper = require('casper').create({
    logLevel: "verbose",
    debug: true,
    clientScripts: ['jquery.js']
});

Comments

Popular posts from this blog

sublimetext3 - what keyboard shortcut is to comment/uncomment for this script tag in sublime -

java - No use of nillable="0" in SOAP Webservice -

ubuntu - Laravel 5.2 quickstart guide gives Not Found Error -