సభ్యులు:Mpradeepbot/mpc.wkt.brown.py
వికీపీడియా నుండి
ఈ ప్రోగ్రాముకు అనుబంధంగా ఈ ఫైలుని వాడండి. బ్రౌను పదకోశం డేటాబేసు ఈ విధంగా ఉంటుంది.
"""Upload Brown dictionary (English -> Telugu) entries to the wiktionary.

The script reads SQL ``INSERT INTO `eng2te` VALUES (...)`` rows from a data
file, builds the wiki text for each headword (or a redirect page), and writes
it through the pywikipedia framework.  Every write is also appended to a log
file.  This is a Python 2 script (it relies on the ``unicode`` builtin).
"""
import codecs
import time

import config
import wikipedia


def writeData(pageTitle, pageData, comment):
    """Replace the contents of page 'pageTitle' with 'pageData'.

    'comment' is used as the edit summary.  Edit conflicts and spam-filter
    rejections are reported and skipped.  The function always waits one
    second before returning.
    """
    page = wikipedia.Page(wikipedia.getSite(), pageTitle)
    try:
        # Load the page's text from the wiki.
        # NOTE(review): the loaded text is discarded; presumably the load is
        # kept so the framework knows the page state before put() - confirm.
        page.get()
    except wikipedia.NoPage:
        pass
    try:
        page.put(pageData, comment=comment)
    except wikipedia.EditConflict:
        wikipedia.output(u'Skipping %s because of edit conflict' % (page.title()))
    except wikipedia.SpamfilterError as url:
        wikipedia.output(u'Cannot change %s because of blacklist entry %s' % (page.title(), url))
    wikipedia.output(u'Waiting for 1 second(s)')
    time.sleep(1)


def writeLogData(pageTitle, pageData, comment, logfile):
    """Append the title, comment and page text to 'logfile', then upload it.

    'logfile' must accept unicode strings in write().  The upload itself is
    delegated to writeData().
    """
    logfile.write(u'Title: ' + pageTitle + u'\r\n')
    logfile.write(u'comment: ' + comment + u'\r\n')
    logfile.write(pageData + u'\r\n\r\n\r\n')
    writeData(pageTitle, pageData, comment)


def getData(pageTitle):
    """Return the text of page 'pageTitle', or u'' if the page is missing."""
    page = wikipedia.Page(wikipedia.getSite(), pageTitle)
    try:
        return page.get(nofollow_redirects=True)
    except wikipedia.NoPage:
        return u''


def getElement(line, position):
    """Return the 1-based 'position'-th quoted field of an SQL value list.

    Fields are split on the "'," sequence that ends a quoted value.  The text
    between the first and the last quote of the selected field is returned
    with leading and trailing whitespace removed.  Raises IndexError when the
    line has fewer than 'position' fields.
    """
    # Mark the end of every quoted value so that commas inside a value are
    # not mistaken for field separators.
    fields = line.replace('\',', '\'-|-|-').split('-|-|-')
    field = fields[position - 1]
    firstQuote = field.find('\'') + 1
    lastQuote = field.rfind('\'')
    return field[firstQuote:lastQuote].strip()


# Index into the localized part-of-speech list for each abbreviation that
# occurs in the dictionary data.
_POS_INDEX = {
    u'a': 0,
    u'adj': 0,
    u'n': 1,
    u'pron': 2,
    u'v': 3,
    u'p': 4,
    u'adv': 5,
    u'prep': 6,
    u'conj': 7,
    u'interj': 8,
}


def replacePos(posWords, pos):
    """Return the entry of 'posWords' for the abbreviation 'pos'.

    Unknown abbreviations (including the empty string) are returned
    unchanged.
    """
    index = _POS_INDEX.get(pos)
    if index is None:
        return pos
    return posWords[index]


def _readUnicodeLine(stream):
    """Read one line from the byte stream 'stream' and decode it as UTF-8."""
    return unicode(stream.readline(), 'utf8')


dataFile = open('mpc.wkt.brown.part62.txt', 'rb')
inputFile = open('mpc.wkt.brown.musa.txt', 'rb')
logfile = codecs.open('mpc.wkt.brown.log', encoding='utf-8', mode='wb')

try:
    # Skip the 3-byte UTF-8 byte order mark of the template file, but only
    # when it is really there; otherwise start again from the first byte so
    # the first part-of-speech word is not truncated.
    if inputFile.read(3) != codecs.BOM_UTF8:
        inputFile.seek(0)
    # The data file is read from its first byte (no BOM is skipped).

    engName = ''
    meaning = ''
    pos = ''
    posType = ''

    # initialize the parts of speech: the first nine lines of the template
    # file, each wrapped in ''' ... ''' (bold wiki markup)
    posWords = []
    for count in range(9):
        line = _readUnicodeLine(inputFile)
        line = line.replace(u'\n', u'').replace(u'\r', u'')
        posWords.append(u'\'\'\'' + line + u'\'\'\'')

    # The next five lines of the template file are used verbatim (including
    # their line terminators) when building the page text.
    brownLine = _readUnicodeLine(inputFile)
    refLine1 = _readUnicodeLine(inputFile)
    refLine2 = _readUnicodeLine(inputFile)
    catline = _readUnicodeLine(inputFile)
    revLine = _readUnicodeLine(inputFile)

    count = 0
    site = wikipedia.getSite()
    for line in dataFile:
        line = unicode(line, 'utf8')
        # A blank line has no fields and would make getElement() fail.
        if not line.strip():
            continue

        # Strip the SQL wrapper so only the quoted value list remains.
        line = line.replace('INSERT INTO `eng2te` VALUES (', '')
        line = line.replace('\');', '\'')

        engName = getElement(line, 1)
        pos = getElement(line, 2)
        posType = getElement(line, 3)
        meaning = getElement(line, 4)

        # update the parts of speech
        pos = replacePos(posWords, pos)
        posType = replacePos(posWords, posType)

        # Check if the current page becomes a redirect page
        redirectTo = u''
        if meaning[0:4] == u'See ' or meaning[0:4] == u'see ':
            # Drop as many trailing characters as there are periods in the
            # meaning (one, for the usual "See word." form).
            redirectTo = meaning[4:len(meaning.replace(u'.', u''))]
            # Headwords are stored without a leading "to ", so the redirect
            # target must drop it as well.
            if redirectTo[0:3] == u'To ' or redirectTo[0:3] == u'to ':
                redirectTo = redirectTo[3:]

        # Check if current page will have redirects from any page
        redirectFrom = u''
        # the 'to' case
        if engName[0:3] == u'To ' or engName[0:3] == u'to ':
            redirectFrom = engName
            engName = engName[3:]
        # the 'or' case
        if u' or ' in engName:
            parts = engName.split(u' or ')
            redirectFrom = parts[1]
            engName = parts[0]
        # the ',' case
        if u',' in engName:
            parts = engName.split(u',')
            redirectFrom = parts[1]
            engName = parts[0]

        # A doubled quote is the SQL escape for a single quote.
        engName = engName.replace(u'\'\'', u'\'')
        engName = engName.lower()
        redirectFrom = redirectFrom.lower()
        redirectTo = redirectTo.lower()

        # replace the * in meaning with engName
        meaning = meaning.replace(u'*', u'\'\'' + engName + u'\'\'')

        # divide the examples in the meaning: normalize to exactly one space
        # after every period, then turn every sentence break except the last
        # one into a new bullet
        meaning = meaning.replace(u'. ', u'.')
        meaning = meaning.replace(u'.', u'. ')
        sentenceBreaks = meaning.count(u'. ')
        if sentenceBreaks >= 2:
            meaning = meaning.replace(u'. ', u'.\n* ', sentenceBreaks - 1)

        # build the text for the pages
        redirectFromData = u''
        mainPageData = u''
        if redirectFrom != u'':
            redirectFromData = u'#REDIRECT [[' + engName + u']]\n'

        if redirectTo != u'':
            mainPageData = u'#REDIRECT [[' + redirectTo + u']]\n'
            comment = u'Bot: creating redirect page'
        else:
            mainPageData = brownLine
            if pos != u'':
                mainPageData = mainPageData + pos + u', '
            if posType != u'':
                mainPageData = mainPageData + posType + u', '
            mainPageData = mainPageData + meaning + u'\n\n\n'
            mainPageData = mainPageData + refLine1 + refLine2 + u'\n'
            mainPageData = mainPageData + catline + u'\n'
            mainPageData = mainPageData + u'<!-- Interwiki Links -->\n[[en:' + engName + u']]'
            comment = u'Bot: creating page for a word'

        wikipedia.output(mainPageData)
        wikipedia.output(u'')
        wikipedia.output(u'')
        wikipedia.output(u'')

        # upload to wiktionary
        # upload the redirectFrom page
        if redirectFrom != u'':
            data = getData(redirectFrom)
            if (data + u'\n') == redirectFromData:
                wikipedia.output(u'no need to update any thing')
            elif data == u'':
                writeLogData(redirectFrom, redirectFromData, u'Bot: creating redirect page', logfile)
            else:
                # The page exists with other content: leave a note on its
                # talk page instead of overwriting it.
                writeLogData(u'Talk:' + redirectFrom, u'Add the following text to main page\n ' + redirectFromData, u'Bot: creating redirect page', logfile)

        # upload the main page
        data = getData(engName)
        if (data + u'\n') == mainPageData:
            wikipedia.output(u'no need to update any thing')
        elif data == u'':
            writeLogData(engName, mainPageData, comment, logfile)
        elif redirectTo != u'':
            # Never replace an existing page with a redirect; ask on the
            # talk page instead.
            writeLogData(u'Talk:' + engName, u'Add the following text to main page\n ' + mainPageData, comment, logfile)
        elif mainPageData in data:
            wikipedia.output(u'no need to do any update')
        else:
            # Append the new meaning to the existing page text.
            writeLogData(engName, data + u'\n\n' + mainPageData + u'\n\n' + revLine, u'Bot: Updating word page with meaning from Brown dictionary', logfile)

        count = count + 1
        ## uncomment the following lines while testing the BOT
        # if count >= 10:
        #     break

    print('Total records uploaded - ' + str(count))
finally:
    # Close the files even when an upload or a malformed record aborts the run.
    dataFile.close()
    inputFile.close()
    logfile.close()