#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Linkifies a keyword in every articles at the first occurrence.
Usage: linkify.py [OPTIONS] <keyword>
Available options:
-file:PAGELIST list of articles to check
-cat:CATEGORY category to check
-xml:XMLFILE XML dump to check
-namespace:ID namespace to work on
"""
#
# hu:User:Chery, January 23, 2007
# Public domain.
#
import wikipedia
import pagegenerators
import sys, re
import catlib
#
# TODO: should also match when the keyword starts with an accented lowercase letter
#
class XmlDumpLinkifyPageGenerator:
    """Yields pages from an XML dump whose text matches the keyword
    but does not already contain a link to it."""
    def __init__(self, regex, exception, xmlfilename):
        self.regex = regex
        self.exception = exception
        self.xmlfilename = xmlfilename

    def __iter__(self):
        import xmlreader
        mysite = wikipedia.getSite()
        dump = xmlreader.XmlDump(self.xmlfilename)
        for entry in dump.parse():
            if self.regex.search(entry.text) and not self.exception.search(entry.text):
                page = wikipedia.Page(mysite, entry.title)
                yield page
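
# A minimal usage sketch (the dump file name is an assumption for illustration):
#   for page in XmlDumpLinkifyPageGenerator(regex, exception, 'huwiki-latest.xml'):
#       print page.title()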
def inspect(gen, regex, exception, keyword):
    msg = {
        'en': u'Robot: Linkifying [[%s]]',
        'hu': u'Robot: Hivatkozás erre: [[%s]]',
    }
    acceptall = False
    for page in gen:
        try:
            textold = page.get()
            if not page.canBeEdited():
                wikipedia.output(u'Skipping locked page %s' % page.title())
                continue
        except wikipedia.NoPage:
            wikipedia.output(u'Page %s not found' % page.title())
            continue
        except wikipedia.IsRedirectPage:
            wikipedia.output(u'Page %s is a redirect; skipping.' % page.title())
            continue
        if exception.search(textold):
            wikipedia.output(u'Page %s already includes a link; skipping.' % page.title())
            continue
        # Link only the first occurrence of the keyword.
        textnew = regex.sub('[[' + keyword + ']]', textold, 1)
        if textnew == textold:
            wikipedia.output(u'No changes were necessary in %s' % page.title())
        else:
            # Highlight the page title in the console output.
            colors = [None] * 5 + [13] * len(page.title()) + [None] * 4
            wikipedia.output(u'\n>>> %s <<<' % page.title(), colors = colors)
            wikipedia.showDiff(textold, textnew)
            if not acceptall:
                choice = wikipedia.inputChoice(u'Do you want to accept these changes?', ['Yes', 'No', 'All'], ['y', 'N', 'a'], 'N')
                if choice in ['a', 'A']:
                    acceptall = True
            if acceptall or choice in ['y', 'Y']:
                try:
                    wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), msg) % keyword)
                    page.put(textnew)
                except wikipedia.EditConflict:
                    wikipedia.output(u'Skipping %s because of edit conflict' % page.title())
                except wikipedia.SpamfilterError, url:
                    wikipedia.output(u'Cannot change %s because of blacklist entry %s' % (page.title(), url))
try:
    pagelistfile = ''
    xmlfilename = ''
    namespaces = []
    keyword = ''
    gen = None
    for arg in wikipedia.handleArgs():
        if arg.startswith('-namespace:'):
            namespaces.append(int(arg[11:]))
        elif arg.startswith('-file:'):
            pagelistfile = arg[6:]
            gen = pagegenerators.TextfilePageGenerator(pagelistfile)
        elif arg.startswith('-cat:'):
            categoryname = arg[5:]
            cat = catlib.Category(wikipedia.getSite(), 'Category:%s' % categoryname)
            gen = pagegenerators.CategorizedPageGenerator(cat)
        elif arg.startswith('-xml:'):
            xmlfilename = arg[5:]
        elif arg.startswith('-google'):
            if len(arg) >= 8:
                googlequery = arg[8:]
                gen = pagegenerators.GoogleSearchPageGenerator(googlequery)
        else:
            if keyword:
                wikipedia.output(u'Wrong number of arguments; check header for usage.')
                wikipedia.stopme()
                sys.exit()
            keyword = arg
    if not keyword:
        wikipedia.output(u'Wrong number of arguments; check header for usage.')
        wikipedia.stopme()
        sys.exit()
    # Escape the keyword so regex metacharacters in it are matched literally.
    keyword_re = re.escape(keyword)
    # Matches an already existing link to the keyword: [[keyword]] or [[keyword|...]].
    exception = re.compile(r'\[\[' + keyword_re + r'(\||\]\])')
    # Matches a bare occurrence of the keyword that is not inside a
    # [[...]] link or a {{...}} template.
    regex = re.compile(keyword_re + r'(?![^\[]*?\])(?![^{]*?}})')
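    # For example, with keyword 'Foo':
    #   exception matches '[[Foo]]' and '[[Foo|bar]]'
    #   regex matches 'Foo' in 'Foo is a metasyntactic variable'
    #   regex does not match 'Foo' in '[[Foo bar]]' or '{{Foo}}'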
    if xmlfilename:
        gen = XmlDumpLinkifyPageGenerator(regex, exception, xmlfilename)
    if not gen:
        wikipedia.output(u'I was unable to generate a pagelist; exiting.')
        wikipedia.stopme()
        sys.exit()
    if namespaces != []:
        gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces)
    pregen = pagegenerators.PreloadingGenerator(gen, pageNumber = 50)
    inspect(pregen, regex, exception, keyword)
finally:
    wikipedia.stopme()