| author | Martin Mathieson <martin.mathieson@keysight.com> | 2020-07-10 23:07:18 +0100 |
| --- | --- | --- |
| committer | Anders Broman <a.broman58@gmail.com> | 2020-07-12 05:05:08 +0000 |
| commit | 70119bb905be7fbfbbfbe039ffdbc1ef7beeaa94 (patch) | |
| tree | d4f25e1a814b0f7de0b0afe24f14f4dfe757a6ed /tools | |
| parent | 415f72a146ae8acc3cd432a9f2168ea34a99e6bf (diff) | |
check_dissector_urls.py: Add options to control which files to scan
The intention is to try to run this on the Petri-dish buildbot,
where it could run with '--commits 1' to warn about files touched
in the most recent commit.
Change-Id: Ie924d39e093d1fef8cfbdf02d15bbede386b2862
Reviewed-on: https://code.wireshark.org/review/37826
Petri-Dish: Martin Mathieson <martin.r.mathieson@googlemail.com>
Tested-by: Petri Dish Buildbot
Reviewed-by: Anders Broman <a.broman58@gmail.com>
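As a usage sketch, derived from the argparse options added in this patch (the packet-http.c path is only an illustrative filename, not one referenced by this commit):

    ./tools/check_dissector_urls.py                    # no args: scan all of epan/dissectors
    ./tools/check_dissector_urls.py --commits 1        # only files touched by the last commit
    ./tools/check_dissector_urls.py --file epan/dissectors/packet-http.c
    ./tools/check_dissector_urls.py --open --verbose   # staged/unstaged files, extra output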
Diffstat (limited to 'tools')
| -rwxr-xr-x | tools/check_dissector_urls.py | 197 |
1 file changed, 131 insertions, 66 deletions
diff --git a/tools/check_dissector_urls.py b/tools/check_dissector_urls.py
index b9679a0483..08166f7168 100755
--- a/tools/check_dissector_urls.py
+++ b/tools/check_dissector_urls.py
@@ -9,6 +9,9 @@ import os
 import re
 import requests
 import shutil
+import subprocess
+import argparse
+import signal
 
 # This utility scans the dissector code for URLs, then attempts to
 # fetch the links. The results are shown in stdout, but also, at
@@ -19,12 +22,23 @@ import shutil
 
 # TODO:
-# - allow single dissector name to be given as a command-line arg.
 # - option to write back to dissector file when there is a failure?
 # - make requests in parallel (run takes around 35 minutes)?
 # - optionally parse previous successes.txt and avoid fetching them again?
 # - make sure URLs are really within comments in code?
 # - use urllib.parse or similar to better check URLs?
+# - improve regex to allow '+' in URL (like confluence uses)
+
+# Try to exit soon after Ctrl-C is pressed.
+should_exit = False
+
+
+def signal_handler(sig, frame):
+    global should_exit
+    should_exit = True
+    print('You pressed Ctrl+C - exiting')
+
+signal.signal(signal.SIGINT, signal_handler)
 
 
 class FailedLookup:
 
@@ -45,16 +59,6 @@ class FailedLookup:
 
 cached_lookups = {}
 
-# These are strings typically seen after redirecting to a page that won't have
-# What we are looking for. Usually get a 404 for these anyway.
-# TODO: likely more of these...
-apology_strings = ["sorry, we cannot find the page",
-                   "this page could not be found",
-                   "the page you're looking for can't be found",
-                   "the content you are looking for cannot be found...",
-                   "the resource you are looking for has been removed"]
-
-
 class Link(object):
 
     def __init__(self, file, line_number, url):
@@ -67,25 +71,30 @@ class Link(object):
         self.result_from_cache = False
 
     def __str__(self):
-        s = (('SUCCESS ' if self.success else 'FAILED ') + self.file + ':' + str(self.line_number) +
-             ' ' + self.url + " status-code=" + str(self.r.status_code) +
-             ' content-type="' + (self.r.headers['content-type'] if ('content-type' in self.r.headers) else 'NONE') + '"')
+        epan_idx = self.file.find('epan')
+        if epan_idx == -1:
+            filename = self.file
+        else:
+            filename = self.file[epan_idx:]
+        s = ('SUCCESS ' if self.success else 'FAILED ') + \
+            filename + ':' + str(self.line_number) + ' ' + self.url
+        if True:  # self.r:
+            if self.r.status_code:
+                s += " status-code=" + str(self.r.status_code)
+            if 'content-type' in self.r.headers:
+                s += (' content-type="' +
+                      self.r.headers['content-type'] + '"')
+        else:
+            s += ' <No response Received>'
         return s
 
-    def looksLikeApology(self):
-        content = str(self.r.content)
-        # N.B. invariably comes back as just one line...
-        if any(needle in content for needle in apology_strings):
-            print('Found apology!')
-            return True
-        return False
-
     def validate(self, session):
         # Fetch, but first look in cache
         global cached_lookups
         self.tested = True
         if self.url in cached_lookups:
-            print('[Using cached result for', self.url, ']')
+            if args.verbose:
+                print('[Using cached result for', self.url, ']')
             self.r = cached_lookups[self.url]
             self.result_from_cache = True
         else:
@@ -97,7 +106,8 @@ class Link(object):
                 # Cache this result.
                 cached_lookups[self.url] = self.r
             except (ValueError, ConnectionError, Exception):
-                print(self.url, ': failed to make request')
+                if args.verbose:
+                    print(self.url, ': failed to make request')
                 self.success = False
                 # Add bad result to crashed_lookups.
                 cached_lookups[self.url] = FailedLookup()
@@ -109,49 +119,100 @@ class Link(object):
             self.success = False
             return
 
-        # Look for 'not really found' type strings in r.content
-        if self.looksLikeApology():
-            print('Got body, but it looks like content has moved?')
-            self.success = False
-            return
-
         # Assume its Ok.
         self.success = True
 
 
-# Scan the given folder for links to test.
+links = []
 
 
-def findLinks(folder):
-    links = []
-    # Look at files in sorted order, to give some idea of how far through it
-    # is.
-    for filename in sorted(os.listdir(folder)):
-        if filename.endswith('.c'):
-            with open(os.path.join(folder, filename), 'r') as f:
-                for line_number, line in enumerate(f, start=1):
-                    urls = re.findall(
-                        r'https?://(?:[a-zA-Z0-9./_?&=-]+|%[0-9a-fA-F]{2})+', line)
+def findLinksInFile(filename):
+    with open(filename, 'r') as f:
+        for line_number, line in enumerate(f, start=1):
+            # TODO: not matching
+            # https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol
+            urls = re.findall(
+                r'https?://(?:[a-zA-Z0-9./_?&=-]+|%[0-9a-fA-F]{2})+', line)
 
-                    for url in urls:
-                        # Lop off any trailing chars that are not part of it
-                        url = url.rstrip(").',")
+            for url in urls:
+                # Lop off any trailing chars that are not part of it
+                url = url.rstrip(").',")
 
-                        # A url must have a period somewhere
-                        if '.' not in url:
-                            continue
+                # A url must have a period somewhere
+                if '.' not in url:
+                    continue
+                if args.verbose:
+                    print('Found URL:', url)
+                global links
+                links.append(Link(filename, line_number, url))
 
-                        print('Found URL:', url)
-                        links.append(Link(filename, line_number, url))
-    print('Found', len(links), 'links')
-    return links
+
+# Scan the given folder for links to test.
+def findLinksInFolder(folder):
+    # Look at files in sorted order, to give some idea of how far through it
+    # is.
+    for filename in sorted(os.listdir(folder)):
+        if filename.endswith('.c'):
+            global links
+            findLinksInFile(os.path.join(folder, filename))
 
 
 #################################################################
 # Main logic.
 
-# Find links from dissector folder.
-links = findLinks(os.path.join(os.path.dirname(__file__), '..', 'epan', 'dissectors'))
+# command-line args. Controls which dissector files should be scanned.
+# If no args given, will just scan epan/dissectors folder.
+parser = argparse.ArgumentParser(description='Check URL links in dissectors')
+parser.add_argument('--file', action='store', default='',
+                    help='specify individual dissector file to test')
+parser.add_argument('--commits', action='store',
+                    help='last N commits to check')
+parser.add_argument('--open', action='store_true',
+                    help='check open files')
+parser.add_argument('--verbose', action='store_true',
+                    help='when enabled, show more output')
+
+args = parser.parse_args()
+
+
+
+def isDissectorFile(filename):
+    p = re.compile('epan/dissectors/packet-.*\.c')
+    return p.match(filename)
+
+# Get files from wherever command-line args indicate.
+if args.file:
+    # Fetch links from single file.
+    findLinksInFile(args.file)
+elif args.commits:
+    # Get files affected by specified number of commits.
+    command = ['git', 'diff', '--name-only', 'HEAD~' + args.commits]
+    files = [f.decode('utf-8')
+             for f in subprocess.check_output(command).splitlines()]
+    # Fetch links from files (dissectors files only)
+    for f in files:
+        if isDissectorFile(f):
+            findLinksInFile(f)
+elif args.open:
+    # Unstaged changes.
+    command = ['git', 'diff', '--name-only']
+    files = [f.decode('utf-8')
+             for f in subprocess.check_output(command).splitlines()]
+    # Staged changes.
+    command = ['git', 'diff', '--staged', '--name-only']
+    files_staged = [f.decode('utf-8')
                    for f in subprocess.check_output(command).splitlines()]
+    for f in files:
+        if isDissectorFile(f):
+            findLinksInFile(f)
+    for f in files_staged:
+        if not f in files:
+            if isDissectorFile(f):
+                findLinksInFile(f)
+else:
+    # Find links from dissector folder.
+    findLinksInFolder(os.path.join(os.path.dirname(
        __file__), '..', 'epan', 'dissectors'))
 
 
 # Prepare one session for all requests. For args, see
@@ -163,12 +224,14 @@ session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) Apple
                         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'})
 
 # Try out the links.
-limit = 5000  # Control for debug
 for checks, link in enumerate(links):
+    if should_exit:
+        # i.e. if Ctrl-C has been pressed.
+        exit(0)
     link.validate(session)
-    print(link)
-    if checks > limit:
-        break
+    if args.verbose or not link.success:
+        print(link)
+
 
 # Write failures to a file. Back up any previous first though.
 if os.path.exists('failures.txt'):
@@ -184,16 +247,18 @@ with open('successes.txt', 'w') as f_s:
         f_s.write(str(l) + '\n')
 
 
-# Show overall stats.
+# Count and show overall stats.
 passed, failed, cached = 0, 0, 0
 for l in links:
-    if l.tested and not l.result_from_cache:
-        if l.success:
-            passed += 1
-        else:
-            failed += 1
-    if l.result_from_cache:
+    if not l.result_from_cache:
+        if l.tested:
+            if l.success:
+                passed += 1
+            else:
+                failed += 1
+    else:
         cached += 1
+
 print('--------------------------------------------------------------------------------------------------')
-print(len(links), 'links checked: , ', passed, 'passed,',
-      failed, 'failed', cached, 'results from cache')
+print(len(links), 'links checked: ', passed, 'passed,',
+      failed, 'failed (', cached, 'results from cache)')
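As an aside, the URL-extraction step can be tried in isolation. The snippet below is a standalone sketch: the sample comment lines are invented, but the regex, the rstrip() call, and the period check are copied from findLinksInFile() in this patch. The second line reproduces the TODO noted above: '+' is not in the character class, so confluence-style URLs come out truncated.

    import re

    # Sample dissector comment lines (invented for illustration).
    lines = [' * See https://www.iana.org/assignments/service-names-port-numbers.xhtml).',
             ' * https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol']
    for line in lines:
        for url in re.findall(r'https?://(?:[a-zA-Z0-9./_?&=-]+|%[0-9a-fA-F]{2})+', line):
            url = url.rstrip(").',")   # lop off trailing chars that are not part of it
            if '.' in url:             # a URL must have a period somewhere
                print(url)
    # Prints:
    #   https://www.iana.org/assignments/service-names-port-numbers.xhtml
    #   https://cwiki.apache.org/confluence/display/KAFKA/A   (truncated at the first '+')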