It is very common, and even recommended, for programmers to include detailed comments and metadata in their source code. However, comments and metadata included in HTML code might reveal internal information that should not be available to potential attackers. A review of comments and metadata should therefore be carried out to determine whether any information is being leaked.
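To make the idea concrete, here is a minimal sketch of pulling comments out of a page with a regular expression, which is the same approach the full script below uses. The HTML fragment and the comment it contains are hypothetical, invented only for illustration.

import re

# Hypothetical fragment of a fetched page; the comment leaks an internal path.
html = '<div id="main"><!-- TODO: remove before release, backup at /old/db_dump.sql --></div>'

# Non-greedy match of everything between the comment delimiters.
for comment in re.findall('<!--(.+?)-->', html):
    print(comment.strip())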
Below is a simple Python script that fetches each URL from a list, extracts all HTML comments, and stores the ones that pass a filter in a log file.
# python 3.4
import urllib.request
import urllib.error
import re

project = "blog_btbw"
urlFile = project + "/url_20.txt"
filterUrl = project + "/filter.txt"

file_ = open('log.txt', 'w')
garbage = []


def get_content(_url):
    """Fetch the raw HTML of a URL; return an empty string on error."""
    req = urllib.request.Request(_url, method='GET')
    try:
        response = urllib.request.urlopen(req)
        return response.read().decode('utf8')
    except urllib.error.URLError:
        print("Error: " + _url)
        return ""


def get_html_comments(_html):
    """Return the contents of all <!-- ... --> comments found in the HTML."""
    p = re.compile('<!--(.+?)-->')
    return p.findall(_html)


def log_line(_line):
    file_.write(_line)


def init_filter():
    """Load known, uninteresting comments (one per line) into the garbage list."""
    with open(filterUrl) as filterFile:
        for one_filter in filterFile:
            garbage.append(one_filter.strip())


def is_valid(_content):
    return _content.strip() not in garbage


init_filter()
with open(urlFile) as urls:
    for idxUrl, urlLine in enumerate(urls):
        try:
            url = urlLine.strip()
            log_line("---------- " + url + " ----------\n")
            content = get_content(url)
            comments = get_html_comments(content.strip().replace('\n', ''))
            for idx, comment in enumerate(comments):
                if is_valid(comment):
                    log_line(comment.strip() + "\n")
            log_line("--------------------------\n")
            print(idxUrl, url)
        except Exception as error:
            print(error)
file_.close()
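To try it, save the script next to a blog_btbw directory containing the two input files shown below (the project name and file names are hard-coded at the top of the script and can be changed), then run it with a Python 3 interpreter. The comments that pass the filter are written to log.txt, grouped per URL.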
blog_btbw/filter.txt
.col-md-12
.entry-meta
.entry-header
.entry-summary
blog_btbw/url_20.txt
http://www.blog.btbw.pl/
http://www.blog.btbw.pl/robots.txt
http://www.blog.btbw.pl/sitemap.xml
http://www.blog.btbw.pl/category/java/
http://www.blog.btbw.pl/category/java-script/angularjs/
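In log.txt each URL gets its own section, delimited by the separator lines the script writes. A sketch of the layout (the middle line is a placeholder, not real output) looks like this:

---------- http://www.blog.btbw.pl/ ----------
hypothetical leaked comment, e.g. a developer note or an internal path
--------------------------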