Scaricare l’ultima versione di un certo programma
10 novembre 2009
Vi è mai capitato di dover (o voler) scaricare l’ultima versione di uno o più programmi? O sapere semplicemente se è stato rilasciato un aggiornamento?
Bene, a Frafra queste cose capitano, soprattutto quando si parla di sorgenti, compilazione e compagnia cantante.
Ho creato un programma Python (>= 3.x, testato su 3.1.1) che risolve automaticamente questo problema, con l’uso di un parser e di un crawler.
Sono cento linee giuste giuste, parzialmente commentate, con tanto di licenza (questa volta metto il file per intero, perché in primo luogo ho raggiunto un numero di linee tondo tondo, e in secondo luogo perché la prima linea è molto importante).
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# takeit.py
#
# Copyright 2009 Francesco Frassinelli <fraph24@gmail.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Show the latest version of every source package found on a server.

The program crawls an HTML directory listing recursively, collects every
link that looks like ``<package>-<version>.tar.gz`` (or ``.tar.bz2``) and
prints, for each package, the highest version found and its URL.

Usage: takeit.py [server-name [directory ...]]
"""

### HTMLParser patched version, see: http://bugs.python.org/issue755660
# $ cp /usr/local/lib/python3.1/html/parser.py HTMLParser.py
# $ wget http://bugs.python.org/file13041/htmlparser_error.diff
# $ patch -p0 HTMLParser.py < htmlparser_error.diff
try:
    import HTMLParser
except ImportError:
    # No patched copy next to the script: use the standard library parser
    # under the same module name so the rest of the code is unchanged.
    import html.parser as HTMLParser

import re
import sys
import urllib.request

# Known servers: short name -> base URL of the directory listing to crawl.
# Add your own here (every base URL must end with "/").
servers = {
    "gnu": "http://ftp.gnu.org/gnu/",
    "linux": "http://ftp.kernel.org/pub/linux/",
    "vim": "http://ftp.vim.org/pub/vim/unix/",
}


def _version_key(version):
    """Return a tuple of ints that orders *version* numerically.

    Plain string comparison would rank "2.9" above "2.10"; comparing the
    numeric components one by one gives the expected order.
    """
    return tuple(int(part) for part in re.findall(r"\d+", version))


class Parser(HTMLParser.HTMLParser):
    """HTML parser that reports the ``href`` of every ``<a>`` tag.

    ``callback(page, href)`` is called once per link, where *page* is the
    URL of the page being parsed.
    """

    def __init__(self, page, callback):
        super().__init__()
        self.callback = callback
        self.page = page  # Current page

    def handle_starttag(self, tag, attrs):
        if tag != "a":
            return
        # A valueless attribute (<a href>) is reported with value None:
        # there is nothing to follow, so it is skipped like a missing href.
        href = dict(attrs).get("href")
        if href is not None:
            self.callback(self.page, href)

    def error(self, message):
        pass  # Overriding the default function, in order to skip any error


class Crawler:
    """Recursive crawler collecting source tarballs from directory listings.

    ``packages`` maps a package name to a list of ``(version, url)`` tuples.
    """

    def __init__(self):
        self.source = re.compile(r"""^
            (.+?)-                  # package name
            ([\d\.-]+)              # version
            \.(tar\.bz2|tar\.gz)    # compression
            $""", re.VERBOSE)  # VERBOSE is for multiline commented regex
        self.packages = dict()

    def inspect(self, page):
        """Download the directory listing at *page* and process its links.

        Errors raised by ``urllib.request.urlopen`` are not caught here.
        """
        # Links are resolved as page + link, so a directory URL must end
        # with "/" (it may not when it comes from the command line).
        if not page.endswith("/"):
            page += "/"
        print("I'm in:", page)  # If there're errors, report it *always*
        with urllib.request.urlopen(page) as data:
            text = data.read().decode("utf_8", "ignore")
        # Parse after the connection is closed: parsing recurses into
        # subdirectories and would otherwise keep every parent response open.
        parser = Parser(page, self.callback)
        parser.feed(text)
        parser.close()

    def item(self, page, link):
        """Record *link* if its name looks like a versioned source tarball."""
        res = self.source.match(link)
        if res is None:
            return
        package, version, _compression = res.groups()
        if package == "0":  # Should we use re.match("^\d+$", package)?
            return  # Not properly detected: regex could be wrong
        self.packages.setdefault(package, []).append((version, page + link))

    def callback(self, page, link):
        """Dispatch a link found on *page*: recurse, record it, or skip it."""
        # Special link / Backwards link / Absolute path / Current directory /
        # External link or nested path
        useless = (
            "?" in link
            or ".." in link
            or link.startswith("/")
            or link == "./"
            or link.count("/") > 1
        )
        if useless:
            return
        if link.endswith("/"):
            self.inspect(page + link)  # Directory: recursive function
        else:
            self.item(page, link)  # File: analyze its name

    def results(self):
        """Yield ``(package, version, url)`` for each package's newest file.

        Packages are yielded in alphabetical order. Versions are compared
        numerically; equal versions fall back to comparing the
        ``(version, url)`` tuples themselves.
        """
        for package, candidates in sorted(self.packages.items()):
            version, url = max(
                candidates, key=lambda found: (_version_key(found[0]), found)
            )
            yield package, version, url


def mycrawler(base, directory=""):
    """Crawl ``base + directory`` and print the newest version of each package."""
    crawler = Crawler()
    crawler.inspect(base + directory)
    for package, version, url in crawler.results():
        print()  # Newline ("\n")
        print("Name:", package)
        print(" -> Version:", version)
        print(" -> URL:", url)


def main(argv=None):
    """Command-line entry point.

    ``argv[1]`` (optional) is a key of ``servers``; without it every known
    server is crawled. ``argv[2:]`` (optional) are directories, relative to
    the server base URL, to start from.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) > 1:
        name = argv[1]
        if name not in servers:
            sys.exit("Unknown server %r; available servers: %s"
                     % (name, ", ".join(sorted(servers))))
        base = (servers[name],)
    else:
        base = servers.values()
    # Duplicated directories are crawled once; sorted for a stable order
    directory = sorted(set(argv[2:])) if len(argv) > 2 else ("",)
    for server in base:
        for page in directory:
            mycrawler(server, page)


if __name__ == "__main__":
    main()
