#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# takeit.py
#
# Copyright 2009 Francesco Frassinelli <fraph24@gmail.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
""" This program shows you the latest version of a source package """
### HTMLParser patched version, see: http://bugs.python.org/issue755660
# $ cp /usr/local/lib/python3.1/html/parser.py HTMLParser.py
# $ wget http://bugs.python.org/file13041/htmlparser_error.diff
# $ patch -p0 HTMLParser.py < htmlparser_error.diff
import HTMLParser
import re, sys, urllib.request
# Known download servers, keyed by the short name given as the first
# command-line argument. Add your own here, test it, and report it.
servers = {
    "gnu": "http://ftp.gnu.org/gnu/",
    "linux": "http://ftp.kernel.org/pub/linux/",
    "vim": "http://ftp.vim.org/pub/vim/unix/",
}
class Parser(HTMLParser.HTMLParser):
    """HTML parser that reports the target of every anchor it encounters.

    page     -- URL of the page being parsed; handed back to the callback
    callback -- callable invoked as callback(page, href) for each <a> start
                tag that carries an href value
    """

    def __init__(self, page, callback):
        super().__init__()
        self.callback = callback
        self.page = page  # Current page

    def handle_starttag(self, tag, attrs):
        """Forward the href of an <a> start tag to the callback."""
        if tag != "a":
            return
        href = dict(attrs).get("href")
        # A valueless attribute (<a href>) is reported with the value None;
        # forwarding it would break string operations in the callback.
        if href is not None:
            self.callback(self.page, href)

    def error(self, message):
        pass  # Overwriting default function, in order to skip any error
class Crawler:
    """Recursively walk HTML directory listings and collect source tarballs.

    Every link named <package>-<version>.tar.gz or .tar.bz2 is recorded in
    self.packages, a dict mapping the package name to a list of
    (version, url) tuples.
    """

    def __init__(self):
        self.source = re.compile(r"""^
            (.+?)-                  # package name
            ([\d\.-]+)              # version
            \.(tar\.bz2|tar\.gz)    # compression
            $""", re.VERBOSE)  # VERBOSE is for multiline commented regex
        self.packages = dict()

    def inspect(self, page):
        """Download *page* and parse it, sending each link to callback()."""
        print("I'm in:", page)  # If there're errors, report it *always*
        with urllib.request.urlopen(page) as data:
            parser = Parser(page, self.callback)
            parser.feed(data.read().decode("utf_8", "ignore"))
            parser.close()

    def item(self, page, link):
        """Record *link* under its package name if it looks like a tarball."""
        # The pattern is anchored with ^ and $, so one match() is enough.
        res = self.source.match(link)
        if res is None:
            return
        package, version, _compression = res.groups()
        if package == "0":  # Should we use re.match("^\d+$", package)?
            return  # Not properly detected: regex could be wrong
        self.packages.setdefault(package, []).append((version, page + link))

    def callback(self, page, link):
        """Dispatch one link: recurse into directories, record files."""
        # Special link / Backwards link / External link or absolute path.
        # The startswith() test also catches root-relative links with a
        # single slash ("/" or "/file"), which the slash count lets through.
        useless = ("?" in link or ".." in link
                   or link.startswith("/") or link.count("/") > 1)
        if useless:
            return
        if link.endswith("/"):
            self.inspect(page + link)  # Directory: recursive function
        else:
            self.item(page, link)  # File: analyze its name

    @staticmethod
    def _version_key(version):
        """Return the numeric components of *version* as a tuple of ints."""
        # The version group only contains digits, dots and dashes.
        return tuple(int(part) for part in re.split(r"[.-]", version) if part)

    def results(self):
        """Yield (package, version, url) for the newest tarball of each
        package, ordered by package name.
        """
        for package in sorted(self.packages):
            # Compare versions numerically so that "1.10" beats "1.9"; the
            # raw (version, url) tuple only breaks ties deterministically.
            version, url = max(
                self.packages[package],
                key=lambda found: (self._version_key(found[0]), found))
            yield package, version, url  # Just to iterate it
def mycrawler(base, directory=""):
    """Crawl base + directory and print the newest tarball of each package."""
    spider = Crawler()
    spider.inspect(base + directory)
    for name, latest, link in spider.results():
        print()  # Blank line between packages
        report = (
            ("Name:", name),
            (" -> Version:", latest),
            (" -> URL:", link),
        )
        for label, value in report:
            print(label, value)
if __name__ == "__main__":
    # Usage: takeit.py [server [directory ...]]
    # Without arguments every known server is crawled from its base URL.
    if len(sys.argv) > 1:
        name = sys.argv[1]
        if name not in servers:
            # Exit with a readable message instead of a KeyError traceback.
            sys.exit("Unknown server %r; available: %s"
                     % (name, ", ".join(sorted(servers))))
        base = (servers[name],)
    else:
        base = servers.values()
    # Duplicated directory arguments are crawled only once.
    directory = set(sys.argv[2:]) or ("",)
    for server in base:
        for page in directory:
            mycrawler(server, page)