#!/usr/bin/env python
# Python Script to download Splunk Documentation PDFs
# Posted: 2012-11-20 16:33
'''
Script to download all PDF files for a particular version of Splunk documentation.

Requirements:
    requests: http://docs.python-requests.org
    beautifulsoup4: http://www.crummy.com/software/BeautifulSoup/
    (pip install -U requests beautifulsoup4)

Author: Michael Rabbitt (https://github.com/mrabbitt)
'''
from __future__ import unicode_literals, print_function

import os.path
import shutil
import re
import sys
import tempfile

import requests
from bs4 import BeautifulSoup

# Matches manual links of the form /Documentation/Splunk/<version>/<section>/<docname>.
manual_link_regex = re.compile(r'^/Documentation/Splunk/([\d.]+)/([^/]+)/(.+)$')
# Extracts the quoted filename parameter from a Content-Disposition header value.
filename_regex = re.compile(r'filename="([^"]+)"')


def getDownloadFileName(response, default_name):
    '''Determine a file name for a response object based on its headers.

    Reverts to `default_name` if no name is suggested by the response headers.

    response: object exposing a mapping-like ``headers`` attribute.
    default_name: name returned when no ``filename="..."`` parameter is found.
    '''
    content_disposition = response.headers.get('content-disposition')
    if content_disposition:
        # search() rather than match(): the filename parameter normally
        # follows a disposition type (e.g. 'attachment; filename="x.pdf"'),
        # so a pattern anchored at the start of the header never matches.
        match = filename_regex.search(content_disposition)
        if match:
            return match.group(1)
    return default_name


def main(download_directory, target_version):
    '''Download every manual PDF linked from the documentation index page.

    download_directory: existing directory that receives the PDF files.
    target_version: version string inserted into the index page URL.

    Raises requests.HTTPError if the index page request fails. A manual whose
    PDF request fails is reported on stderr and skipped.
    '''
    response = requests.get('http://docs.splunk.com/Documentation/Splunk/{0}'.format(target_version))
    # Without the index page there is nothing to download; fail loudly.
    response.raise_for_status()
    page = BeautifulSoup(response.text, 'html.parser')

    for div in page.find_all('div', class_='manualmodule'):
        manual_link = div.find('a')
        href = manual_link.get('href') if manual_link is not None else None
        match = manual_link_regex.match(href) if href else None
        if match is None:
            # Not a manual link in the expected form; nothing to build a URL from.
            continue

        doc_description = manual_link.text.strip()
        (version, section, docname) = match.groups()
        pdf_url = 'http://docs.splunk.com/index.php?title=Documentation:Splunk:{0}:{1}:{2}&action=pdfbook'.format(section, docname, version)

        print('Downloading "{0}" from <{1}>...'.format(doc_description, pdf_url))
        pdf_response = requests.get(pdf_url)
        if not pdf_response.ok:
            # Do not save an HTTP error page under a .pdf name.
            print('Skipping "{0}": HTTP status {1}'.format(doc_description, pdf_response.status_code),
                  file=sys.stderr)
            continue

        default_name = 'Splunk-{0}-{1}.pdf'.format(version, section)
        # The header value is server-controlled: keep only the final path
        # component so it cannot direct the write outside download_directory.
        file_name = os.path.basename(getDownloadFileName(pdf_response, default_name)) or default_name

        with tempfile.NamedTemporaryFile(suffix=file_name, delete=False) as temp_file:
            temp_file.write(pdf_response.content)

        # Move only after the temporary file has been closed (and flushed).
        target_path = os.path.join(download_directory, file_name)
        print('Moving temporary file to {0}'.format(target_path))
        shutil.move(temp_file.name, target_path)

    print('Complete')


if __name__ == '__main__':
    # VERSION is optional; without it the 'latest' documentation is requested.
    if len(sys.argv) in (2, 3):
        download_directory = sys.argv[1]
        target_version = sys.argv[2] if len(sys.argv) == 3 else 'latest'

        if os.path.isdir(download_directory):
            main(download_directory, target_version)
        else:
            print('No such directory: {0}'.format(download_directory), file=sys.stderr)
    else:
        print('''Usage: {0} PATH_TO_DOWNLOAD_DIR [VERSION]'''.format(os.path.basename(__file__)), file=sys.stderr)