Python - How to download from a JavaScript-rendered webpage?
How can I download links on a JavaScript-rendered webpage? Python is the preferred language.
So far, I've tried using the Python bindings for Selenium on a headless server. This approach is terribly slow, fraught with error, and incapable of reliably determining download progress or success. Additionally, the headless server interferes with my clipboard (which is a problem). I used Firefox because it can be configured to download to a default directory, but I don't think the Chrome situation is any better.
Alternatively, I've tried using WebKit.
def render(url):
    """Fully render a webpage (JavaScript and all) and return the HTML.

    The rendering runs in a separate ``python2`` process: a small PyQt4/WebKit
    script is piped to the interpreter's stdin and the URL is passed as its
    first command-line argument. The child's stdout (the rendered page) is
    returned as text.

    Args:
        url: Address of the page to load and render.

    Returns:
        The rendered HTML, decoded from the child's stdout as latin-1.
    """
    import subprocess
    from textwrap import dedent

    # NOTE: the embedded script is Python 2 source (print statement) and
    # needs PyQt4 to be importable by the ``python2`` interpreter.
    script = dedent("""\
        import sys
        from PyQt4.QtCore import QUrl
        from PyQt4.QtGui import QApplication
        from PyQt4.QtWebKit import QWebPage

        class Render(QWebPage):
            def __init__(self, url):
                self.app = QApplication(sys.argv)
                QWebPage.__init__(self)
                self.loadFinished.connect(self._loadFinished)
                self.mainFrame().load(QUrl(url))
                self.app.exec_()

            def _loadFinished(self, result):
                self.frame = self.mainFrame()
                self.app.quit()

        render = Render(sys.argv[1])
        print render.frame.toHtml().toAscii()""").encode()

    # "-" tells the interpreter to read the program from stdin.
    process = subprocess.Popen(
        ['python2', '-', url],
        stderr=subprocess.PIPE,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE)

    # Pipe the script to python's stdin; element 0 of the result is stdout.
    return process.communicate(script)[0].decode('latin1')
This would be great if not for the fact that I need to download in the same session. Is there a way to preserve the session used to render the page? PyQt4 and WebKit are just a bunch of shared libraries. I'm not sure how to tear the guts out of them, or whether such a thing is even possible.
Right now I'm doing the following:
# Fetch the page and download the extracted link through the SAME HTTP
# session, so the session used for the page request is reused for the download.
with requests.Session() as session:
    html = session.get(url).text
    link = get_url(html)
    download(link, session=session)
Without getting into details, get_url(html, url)
extracts the JavaScript from the page, hacks away the calls to the DOM, and then executes it in node
. Pretty nasty stuff...
Is there any way I can safely render a webpage and keep the session?
I'm open to doing this in node if Python is not appropriate or if a JavaScript alternative is more elegant. It looks like perhaps node-dom might suffice? I'm not familiar enough with it to tell, but I'm interested in any suggestions.
PyQt5 in Python 2 or 3 should do the trick in this case. Note that the function is overly complex in order to support earlier versions of PyQt5 that use WebKit as well as later versions that use WebEngine.
import sys


def render(source_html):
    """Return rendered HTML.

    Loads ``source_html`` into a PyQt5 web view/page, lets it finish loading,
    and returns the resulting document as HTML. PyQt5 WebEngine is tried
    first; if it cannot be imported, the older PyQt5 WebKit classes are used.

    Args:
        source_html: HTML markup to load and render.

    Returns:
        The HTML of the page once loading has finished.
    """
    try:
        from PyQt5.QtCore import QEventLoop
        from PyQt5.QtWebEngineWidgets import QWebEngineView
        from PyQt5.QtWidgets import QApplication

        class Render(QWebEngineView):
            """Render HTML with PyQt5 WebEngine."""

            def __init__(self, html):
                self.html = None
                self.app = QApplication(sys.argv)
                QWebEngineView.__init__(self)
                self.loadFinished.connect(self._loadFinished)
                self.setHtml(html)
                # toHtml() delivers its result through a callback, so pump
                # the event loop until _callable has stored the markup.
                while self.html is None:
                    self.app.processEvents(
                        QEventLoop.ExcludeUserInputEvents |
                        QEventLoop.ExcludeSocketNotifiers |
                        QEventLoop.WaitForMoreEvents)
                self.app.quit()

            def _callable(self, data):
                # Receives the page HTML requested in _loadFinished.
                self.html = data

            def _loadFinished(self, result):
                self.page().toHtml(self._callable)

    except ImportError:
        from PyQt5.QtWebKitWidgets import QWebPage
        from PyQt5.QtWidgets import QApplication

        class Render(QWebPage):
            """Render HTML with PyQt5 WebKit."""

            def __init__(self, html):
                self.html = None
                self.app = QApplication(sys.argv)
                QWebPage.__init__(self)
                self.loadFinished.connect(self._loadFinished)
                self.mainFrame().setHtml(html)
                # Blocks until _loadFinished calls quit().
                self.app.exec_()

            def _loadFinished(self, result):
                self.html = self.mainFrame().toHtml()
                self.app.quit()

    return Render(source_html).html
Or PyQt4 in Python 2.
import sys

from PyQt4.QtGui import QApplication
from PyQt4.QtWebKit import QWebPage


class Render(QWebPage):
    """Fully render HTML, JavaScript and all."""

    def __init__(self, html):
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().setHtml(html)
        # Blocks until _loadFinished calls quit().
        self.app.exec_()

    def _loadFinished(self, result):
        # Keep the loaded frame so the caller can read its HTML afterwards.
        self.frame = self.mainFrame()
        self.app.quit()


# `html` is the source markup to render; it is expected to be defined by the
# surrounding code before this point.
render = Render(html)
result = str(render.frame.toHtml().toAscii())
Comments
Post a Comment