
User:Cherybot/linkify.py

From Wikipédia, the free encyclopedia.

#!/usr/bin/python
# -*- coding: utf-8  -*-
"""
Linkifies the first occurrence of a keyword in every article.

Usage: linkify.py [OPTIONS] <keyword>

Available options:
  -file:PAGELIST    list of articles to check
  -cat:CATEGORY     category to check
  -xml:XMLFILE      XML dump to check
  -google:QUERY     Google search results to check
  -namespace:ID     namespace to work on
"""
#
# hu:User:Chery, January 23, 2007
# Public domain.
#
import wikipedia
import pagegenerators
import sys, re
import catlib

#
# TODO: the keyword should also match when its first letter is lowercase,
# including accented characters
#

class XmlDumpLinkifyPageGenerator:
        """Yields pages from an XML dump whose text contains the keyword but
        no link to it yet, finding candidates without fetching them live."""
        def __init__(self, regex, exception, xmlfilename):
                self.regex = regex
                self.exception = exception
                self.xmlfilename = xmlfilename

        def __iter__(self):
                import xmlreader

                mysite = wikipedia.getSite()
                dump = xmlreader.XmlDump(self.xmlfilename)
                for entry in dump.parse():
                        if self.regex.search(entry.text) and not self.exception.search(entry.text):
                                page = wikipedia.Page(mysite, entry.title)
                                yield page
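# A sketch of driving this generator on its own (hypothetical dump file name):
#   for page in XmlDumpLinkifyPageGenerator(regex, exception, 'huwiki.xml'):
#           print page.title()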

def inspect(gen, regex, exception, keyword):
        """Shows a diff of the proposed link for each page and saves on approval."""
        msg = {
                'en': u'Robot: Linkifying [[%s]]',
                'hu': u'Robot: Hivatkozás erre: [[%s]]',
        }
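        # wikipedia.translate() below picks the summary that matches the target
        # wiki's language, falling back to English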

        acceptall = False

        for page in gen:
                try:
                        textold = page.get()
                        if not page.canBeEdited():
                                wikipedia.output(u'Skipping locked page %s' % page.title())
                                continue
                except wikipedia.NoPage:
                        wikipedia.output(u'Page %s not found' % page.title())
                        continue
                except wikipedia.IsRedirectPage:
                        wikipedia.output(u'Page %s is a redirect; skipping.' % page.title())
                        continue

                if exception.search(textold):
                        wikipedia.output(u'Page %s already includes a link; skipping.' % page.title())
                        continue

                textnew = regex.sub('[[' + keyword + ']]', textold, 1)
                if textnew == textold:
                        wikipedia.output(u'No changes were necessary in %s' % page.title())
                else:
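                        # per-character colour list: leave the 5-char '\n>>> ' prefix and
                        # the 4-char ' <<<' suffix uncoloured, highlighting only the title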
                        colors = [None] * 5 + [13] * len(page.title()) + [None] * 4
                        wikipedia.output(u'\n>>> %s <<<' % page.title(), colors = colors)
                        wikipedia.showDiff(textold, textnew)
                        if not acceptall:
                                choice = wikipedia.inputChoice(u'Do you want to accept these changes?', ['Yes', 'No', 'All'], ['y', 'N', 'a'], 'N')
                                if choice in ['a', 'A']:
                                        acceptall = True
                        if acceptall or choice in ['y', 'Y']:
                                try:
                                        wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), msg) % keyword)
                                        page.put(textnew)
                                except wikipedia.EditConflict:
                                        wikipedia.output(u'Skipping %s because of edit conflict' % page.title())
                                except wikipedia.SpamfilterError, url:
                                        wikipedia.output(u'Cannot change %s because of blacklist entry %s' % (page.title(), url))

try:
        pagelistfile = ''
        xmlfilename = ''
        namespaces = []
        keyword = ''
        gen = None

        for arg in wikipedia.handleArgs():
                if arg.startswith('-namespace:'):
                        namespaces.append(int(arg[11:]))
                elif arg.startswith('-file:'):
                        pagelistfile = arg[6:]
                        gen = pagegenerators.TextfilePageGenerator(pagelistfile)
                elif arg.startswith('-cat:'):
                        categoryname = arg[5:]
                        cat = catlib.Category(wikipedia.getSite(), 'Category:%s' % categoryname)
                        gen = pagegenerators.CategorizedPageGenerator(cat)
                elif arg.startswith('-xml:'):
                        xmlfilename = arg[5:]
                elif arg.startswith('-google'):
                        if len(arg) >= 8:
                                googlequery = arg[8:]
                        else:
                                googlequery = wikipedia.input(u'Google query:')
                        gen = pagegenerators.GoogleSearchPageGenerator(googlequery)
                else:
                        if keyword:
                                wikipedia.output(u'Wrong number of arguments; check header for usage.')
                                wikipedia.stopme()
                                sys.exit()
                        keyword = arg

        if not keyword:
                wikipedia.output(u'Wrong number of arguments; check header for usage.')
                wikipedia.stopme()
                sys.exit()

        exception = re.compile(r'\[\[' + re.escape(keyword) + r'(\||\]\])')
        regex = re.compile(re.escape(keyword) + r'(?![^\[]*?\])(?![^{]*?}})')
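        # e.g. for keyword 'Foo': `exception` matches '[[Foo]]' or '[[Foo|',
        # while `regex` skips a 'Foo' followed by ']' before any '[' (already
        # inside a link) or by '}}' before any '{' (inside a template)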
        if xmlfilename:
                gen = XmlDumpLinkifyPageGenerator(regex, exception, xmlfilename)

        if not gen:
                wikipedia.output(u'I was unable to generate a pagelist; exiting.')
                wikipedia.stopme()
                sys.exit()

        if namespaces:
                gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces)
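        # preload article texts in batches of 50 so inspect() does not have to
        # fetch every page with a separate request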
        pregen = pagegenerators.PreloadingGenerator(gen, pageNumber = 50)
        inspect(pregen, regex, exception, keyword)

finally:
        wikipedia.stopme()

