# beeware.github.io/scripts/check_links.py
# -*- coding: utf-8 -*-
"""
Check status of links from anchors and images on the generated site.
mailto: links are currently ignored
"""
# Standard library imports
from __future__ import print_function
import datetime
import io
import os
import shutil
import subprocess
import sys
import tempfile
# Third party imports
from bs4 import BeautifulSoup
import requests
try:
import urlparse as parse
except ImportError:
import urllib.parse as parse
HERE = os.path.abspath(os.path.dirname(__file__))
REPO = os.path.dirname(HERE)
TIMEOUT = 10  # Seconds
TIMEOUT_RETRIES = 3  # Number of retries when a link check hits a ReadTimeout
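# These prefixes are skipped in run_link_checks(); checking the GitHub
# edit/new/search URLs for every content page would take too long.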
WHITELIST = [
'https://github.com/pybee/pybee.github.io/edit/lektor/content/',
'https://github.com/pybee/pybee.github.io/new/lektor/content/',
'https://github.com/issues?q=user%3Apybee+label%3Afirst-timers-only+is%3Aopen&type=Issues',
'https://github.com/issues?q=user%3Apybee+label%3Aup-for-grabs+is%3Aopen&type=Issues',
]
def normalize_url(url, root_url):
"""Normalize a url based on a site root url"""
if url.startswith(('http')):
norm_link = url
elif url.startswith('mailto'):
norm_link = None
else:
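        # Relative links are resolved against the site root, e.g.
        # normalize_url('blog/', 'http://127.0.0.1:5000/') gives
        # 'http://127.0.0.1:5000/blog/'. Note that links relative to a page
        # in a subdirectory are also resolved against the root here.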
norm_link = parse.urljoin(root_url, url)
return norm_link
def get_files(root_path, extensions=('.html',)):
"""Return all files located in root_path that match extensions."""
all_paths = []
for dirname, subdirlist, filelist in os.walk(root_path):
for fname in filelist:
if fname.endswith(extensions):
print('.', end='')
fpath = os.path.join(dirname, fname)
all_paths.append(fpath)
    all_paths = sorted(all_paths)
return all_paths
def parse_urls(fpath, root_url):
"""Parse html file using BS4 and find links for anchors and images."""
urls = set()
if os.path.isfile(fpath):
        with io.open(fpath, 'r', encoding='utf-8') as f:
html_content = f.read()
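        # html.parser is the parser bundled with the standard library, so no
        # extra dependency such as lxml is required.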
soup = BeautifulSoup(html_content, 'html.parser')
# Find anchors
for tag in soup.find_all('a', href=True):
url = tag['href']
norm_url = normalize_url(url, root_url)
if norm_url:
urls.add(norm_url)
# Find images
for tag in soup.find_all('img', src=True):
url = tag['src']
norm_url = normalize_url(url, root_url)
if norm_url:
urls.add(norm_url)
return urls
def check_link(url, root_url, root_path):
""""""
error = None
status = 0
content = ''
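    # Some servers reject the default python-requests User-Agent, so a
    # browser-like one is sent instead.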
    headers = {
        'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/39.0.2171.95 Safari/537.36')}
try:
if url.startswith(root_url):
# Check if actual file path exists, otherwise try to use requests
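            # Internal links are translated back to paths inside the build
            # output, so most of them can be verified with a filesystem
            # check instead of an HTTP request.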
sep = os.sep*2 if os.name == 'nt' else os.sep
url_path = url.replace(root_url[:-1], root_path).replace('/', sep)
if os.path.isdir(url_path) or os.path.isfile(url_path):
status = 200
else:
r = requests.get(url, headers=headers, timeout=TIMEOUT)
                # r.text is str (not bytes), so the substring check below
                # also works on Python 3
                content = r.text
status = r.status_code
else:
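            # External links only need a cheap HEAD request. requests does
            # not follow redirects for HEAD by default, so 301/302 can come
            # back here and are accepted as valid statuses below.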
r = requests.head(url, headers=headers, timeout=TIMEOUT)
status = r.status_code
except Exception as e:
error = True
status = type(e).__name__
if ("Brutus can't find what you're looking for" in content and
not url.endswith('404.html')):
res = 404
elif status not in [200, 301, 302] or error:
res = status or error
else:
res = None
if res:
res = str(res)
return res
def execute(cmd):
"""Execute cmd with popen and yield the output form stdout."""
popen = subprocess.Popen(cmd, stdout=subprocess.PIPE,
stderr=subprocess.PIPE, universal_newlines=True)
for stdout_line in iter(popen.stdout.readline, ""):
# Yield process and stdout output, that way the process can be killed
yield popen, stdout_line
popen.stdout.close()
return_code = popen.wait()
if return_code:
raise subprocess.CalledProcessError(return_code, cmd)
def run_link_checks(root_path, root_url):
"""Run checks on links found for anchors and images."""
print('\nFinding built files:\n')
file_paths = get_files(root_path)
file_links = {}
print('\n\nParsing built files for links:\n')
for i, fpath in enumerate(file_paths):
print('.', end='')
file_links[fpath] = set()
links = parse_urls(fpath, root_url)
for link in links:
file_links[fpath].add(link)
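    # Invert the mapping (file -> links becomes link -> files) so that each
    # unique link is checked only once and a broken link can be reported
    # together with every page that references it.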
link_files = {}
for fpath, links in file_links.items():
for link in links:
if link not in link_files:
link_files[link] = set()
link_files[link].add(fpath)
    unique_links = len(link_files)
    plural = 's' if unique_links != 1 else ''
    print('\n\nFound {0} unique link{1}!\n'.format(unique_links, plural))
    print('\nChecking links:\n')
counter = 0
for i, link in enumerate(sorted(link_files)):
# Ignore `edit content on github` links as they would take too long
if any(link.startswith(i) for i in WHITELIST):
continue
fpaths = link_files[link]
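        # Retry only on ReadTimeout, up to TIMEOUT_RETRIES times; any other
        # result, success or failure, is final.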
for j in range(TIMEOUT_RETRIES):
check = check_link(link, root_url, root_path)
if check:
if 'ReadTimeout' in check:
print('\nRetrying...\n')
else:
break
else:
break
if check:
print('\n\nError number: {0}'.format(counter))
print('Status code or error type: {0}'.format(check))
print('{0}'.format(link))
print('-'*len(link))
for fpath in sorted(fpaths):
clean_fpath = fpath.replace(root_path, '')
clean_url = root_url[:-1] + os.path.dirname(clean_fpath)
print('\t' + clean_url)
counter += 1
print('\n')
else:
print('.', end='')
return counter
def main():
"""Run main script."""
start_time = datetime.datetime.now()
temp_build_path = tempfile.mkdtemp()
port = '5000'
root_url = 'http://127.0.0.1:{0}/'.format(port)
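    # 'lektor server' both builds the site into the temporary directory and
    # serves it on localhost, so links can be checked on disk and over HTTP.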
    # tempfile.mkdtemp() has already created a fresh, empty build directory,
    # so no pre-build cleanup is needed.
cmd = ['lektor', 'server', '--no-prune', '--port', port, '--output-path',
temp_build_path]
print('\nRunning Lektor Server:')
print('\t$ ' + ' '.join(cmd) + '\n')
count = 0
finished_build = False
print('\nBuilding static site:\n')
for p, stdout in execute(cmd):
if stdout and not finished_build:
print('.', end='')
if 'Finished build' in stdout:
finished_build = True
print('\n\nFinished building site!\n\n')
count = run_link_checks(temp_build_path, root_url)
# Kill the server
p.kill()
break
# Remove build folder
if os.path.isdir(temp_build_path):
shutil.rmtree(temp_build_path, ignore_errors=True)
total_seconds = (datetime.datetime.now() - start_time).total_seconds()
# Fail if problems found, otherwise do a normal exit
if count:
if count != 1:
print('\n\n{} links are broken!'.format(count))
else:
print('\n\n{} link is broken!'.format(count))
exit_code = 1
else:
print('\n\nAll links found are working fine!')
exit_code = 0
print('\nProcess took {} seconds\n'.format(total_seconds))
sys.exit(exit_code)
if __name__ == '__main__':
main()