#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Linkifies a keyword in every articles at the first occurrence.
Usage: linkify.py [OPTIONS] <keyword>
Available options:
-file:PAGELIST list of articles to check
-cat:CATEGORY category to check
-xml:XMLFILE XML dump to check
-namespace:ID namespace to work on
"""
#
# hu:User:Chery, January 23, 2007
# Public domain.
#
import wikipedia
import pagegenerators
import sys, re
import catlib
#
# TODO: should also match when the keyword starts with an accented lowercase letter
#
class XmlDumpLinkifyPageGenerator:
    """Yields pages from an XML dump whose text matches the keyword
    but does not already contain a link to it."""
    def __init__(self, regex, exception, xmlfilename):
        self.regex = regex
        self.exception = exception
        self.xmlfilename = xmlfilename

    def __iter__(self):
        import xmlreader
        mysite = wikipedia.getSite()
        dump = xmlreader.XmlDump(self.xmlfilename)
        for entry in dump.parse():
            if self.regex.search(entry.text) and not self.exception.search(entry.text):
                page = wikipedia.Page(mysite, entry.title)
                yield page
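
# A minimal usage sketch (the dump file name is an assumption for illustration):
#   for page in XmlDumpLinkifyPageGenerator(regex, exception, 'huwiki-latest.xml'):
#       print page.title()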
def inspect(gen, regex, exception, keyword):
    msg = {
        'en': u'Robot: Linkifying [[%s]]',
        'hu': u'Robot: Hivatkozás erre: [[%s]]',
    }
    acceptall = False
    for page in gen:
        try:
            textold = page.get()
            if not page.canBeEdited():
                wikipedia.output(u'Skipping locked page %s' % page.title())
                continue
        except wikipedia.NoPage:
            wikipedia.output(u'Page %s not found' % page.title())
            continue
        except wikipedia.IsRedirectPage:
            wikipedia.output(u'Page %s is a redirect; skipping.' % page.title())
            continue
        if exception.search(textold):
            wikipedia.output(u'Page %s already includes a link; skipping.' % page.title())
            continue
        # Link only the first occurrence of the keyword.
        textnew = regex.sub('[[' + keyword + ']]', textold, 1)
        if textnew == textold:
            wikipedia.output(u'No changes were necessary in %s' % page.title())
        else:
            # Highlight the page title in the console output.
            colors = [None] * 5 + [13] * len(page.title()) + [None] * 4
            wikipedia.output(u'\n>>> %s <<<' % page.title(), colors = colors)
            wikipedia.showDiff(textold, textnew)
            if not acceptall:
                choice = wikipedia.inputChoice(u'Do you want to accept these changes?', ['Yes', 'No', 'All'], ['y', 'N', 'a'], 'N')
                if choice in ['a', 'A']:
                    acceptall = True
            if acceptall or choice in ['y', 'Y']:
                try:
                    wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), msg) % keyword)
                    page.put(textnew)
                except wikipedia.EditConflict:
                    wikipedia.output(u'Skipping %s because of edit conflict' % page.title())
                except wikipedia.SpamfilterError, url:
                    wikipedia.output(u'Cannot change %s because of blacklist entry %s' % (page.title(), url))
try:
    pagelistfile = ''
    xmlfilename = ''
    namespaces = []
    keyword = ''
    gen = None
    for arg in wikipedia.handleArgs():
        if arg.startswith('-namespace:'):
            namespaces.append(int(arg[11:]))
        elif arg.startswith('-file:'):
            pagelistfile = arg[6:]
            gen = pagegenerators.TextfilePageGenerator(pagelistfile)
        elif arg.startswith('-cat:'):
            categoryname = arg[5:]
            cat = catlib.Category(wikipedia.getSite(), 'Category:%s' % categoryname)
            gen = pagegenerators.CategorizedPageGenerator(cat)
        elif arg.startswith('-xml:'):
            xmlfilename = arg[5:]
        elif arg.startswith('-google'):
            if len(arg) >= 8:
                googlequery = arg[8:]
                gen = pagegenerators.GoogleSearchPageGenerator(googlequery)
        else:
            if keyword:
                wikipedia.output(u'Wrong number of arguments; check header for usage.')
                wikipedia.stopme()
                sys.exit()
            keyword = arg
    if not keyword:
        wikipedia.output(u'Wrong number of arguments; check header for usage.')
        wikipedia.stopme()
        sys.exit()
    # Escape the keyword so regex metacharacters in it are matched literally.
    keyword_re = re.escape(keyword)
    # Matches an already existing link to the keyword: [[keyword]] or [[keyword|...]].
    exception = re.compile(r'\[\[' + keyword_re + r'(\||\]\])')
    # Matches a bare occurrence of the keyword that is not inside a
    # [[...]] link or a {{...}} template.
    regex = re.compile(keyword_re + r'(?![^\[]*?\])(?![^{]*?}})')
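    # For example, with keyword 'Foo':
    #   exception matches '[[Foo]]' and '[[Foo|bar]]'
    #   regex matches 'Foo' in 'Foo is a metasyntactic variable'
    #   regex does not match 'Foo' in '[[Foo bar]]' or '{{Foo}}'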
    if xmlfilename:
        gen = XmlDumpLinkifyPageGenerator(regex, exception, xmlfilename)
    if not gen:
        wikipedia.output(u'I was unable to generate a pagelist; exiting.')
        wikipedia.stopme()
        sys.exit()
    if namespaces != []:
        gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces)
    pregen = pagegenerators.PreloadingGenerator(gen, pageNumber = 50)
    inspect(pregen, regex, exception, keyword)
finally:
    wikipedia.stopme()