See also ebooksgratis.com: no banners, no cookies, totally FREE.

CLASSICISTRANIERI HOME PAGE - YOUTUBE CHANNEL
Privacy Policy Cookie Policy Terms and Conditions

See also ebooksgratis.com: no banners, no cookies, totally FREE.

CLASSICISTRANIERI HOME PAGE - YOUTUBE CHANNEL
Privacy Policy Cookie Policy Terms and Conditions
సభ్యులు:Mpradeepbot/mpc.wkt.brown.py - వికీపీడియా

సభ్యులు:Mpradeepbot/mpc.wkt.brown.py

వికీపీడియా నుండి

ఈ ప్రోగ్రాముకు అనుబంధంగా ఈ ఫైలుని వాడండి. బ్రౌను పదకోశం డేటాబేసు ఈ విధంగా ఉంటుంది.

import wikipedia, time, config, codecs

# Replace the contents in the page 'pageTitle' with data 'pageData' 
# and add the comment 'comment'
def writeData(pageTitle, pageData, comment):
  page = wikipedia.Page(wikipedia.getSite(), pageTitle)
  try:
    # Load the page's text from the wiki
    data = page.get()
  except wikipedia.NoPage:
    data = u''
  data = pageData
  try:
    page.put(data, comment = comment)
  except wikipedia.EditConflict:
    wikipedia.output(u'Skipping %s because of edit conflict' % (page.title()))
  except wikipedia.SpamfilterError, url:
    wikipedia.output(u'Cannot change %s because of blacklist entry %s' % (page.title(), url))
  wikipedia.output(u'Waiting for 1 second(s)')
  time.sleep(1)

# Appends the record to the logfile, then writes the same text to the
# wiktionary via writeData().
def writeLogData(pageTitle, pageData, comment, logfile):
  entry = u'Title: ' + pageTitle + u'\r\n'
  entry += u'comment: ' + comment + u'\r\n'
  entry += pageData + u'\r\n\r\n\r\n'
  logfile.write(entry)
  writeData(pageTitle, pageData, comment)


# Retrieves the contents of the given page 'pageTitle'.
# If the page does not exist, an empty unicode string is returned.
def getData(pageTitle):
    target = wikipedia.Page(wikipedia.getSite(), pageTitle)
    try:
        # Fetch without following redirects, so redirect pages are seen as-is.
        return target.get(nofollow_redirects=True)
    except wikipedia.NoPage:
        return u''
  
# Returns the 'position'-th (1-based) single-quoted field of an SQL VALUES
# line, with the quotes and any leading/trailing whitespace removed.
def getElement(line, position):
  # A quote immediately followed by a comma marks a field boundary; tag it
  # with a sentinel that cannot appear in the data, then split on it.
  fields = line.replace('\',', '\'-|-|-').split('-|-|-')
  token = fields[position-1]
  first = token.find('\'') + 1
  last = token.rfind('\'')
  return token[first:last].strip()

# Maps a Brown-dictionary part-of-speech abbreviation to its localized label
# in 'posWords'; unrecognized abbreviations are returned unchanged.
def replacePos(posWords, pos):
  slot = {
    'a': 0, 'adj': 0,
    'n': 1,
    'pron': 2,
    'v': 3,
    'p': 4,
    'adv': 5,
    'prep': 6,
    'conj': 7,
    'interj': 8,
  }.get(pos)
  if slot is None:
    return pos
  return posWords[slot]

# --- Main driver (Python 2 script body) ---
# dataFile : SQL dump of Brown dictionary rows (one INSERT statement per line)
# inputFile: UTF-8 helper file: 9 localized part-of-speech labels, then 5
#            page-template lines
# logfile  : UTF-8 log mirroring everything sent to the wiki
dataFile  = open('mpc.wkt.brown.part62.txt', 'rb' )
inputFile = open('mpc.wkt.brown.musa.txt', 'rb' )
logfile = codecs.open('mpc.wkt.brown.log', encoding='utf-8', mode='wb')

#omit 3 characters if it is UTF-8
#dataFile.read(3)
inputFile.read(3)

# Per-record fields parsed from each SQL row.
engName = ''
meaning = ''
pos     = ''
posType = ''

# initialize the parts of speech
# First 9 lines of inputFile are localized POS labels; each is wrapped in
# ''' ... ''' (wiki bold) for use by replacePos().
posWords = []
count = 0
while count < 9:
  line = u'' + unicode(inputFile.readline(), 'utf8')
  line = line.replace(u'\n',u'')
  line = line.replace(u'\r',u'')
  posWords.append(u'\'\'\'' + line + u'\'\'\'')
  count = count + 1

# The next 5 lines are page-template fragments: page header, two reference
# lines, category line, and a revision/footer line.
brownLine = u'' + unicode(inputFile.readline(), 'utf8')
refLine1  = u'' + unicode(inputFile.readline(), 'utf8')
refLine2  = u'' + unicode(inputFile.readline(), 'utf8')
catline   = u'' + unicode(inputFile.readline(), 'utf8')
revLine   = u'' + unicode(inputFile.readline(), 'utf8')

count = 0
site = wikipedia.getSite()  # NOTE(review): never used below; helpers call getSite() themselves

# One SQL INSERT row per iteration: parse, build page text, log and upload.
for line in dataFile:
  line = u'' + unicode(line, 'utf8')
  # Strip the SQL wrapper so only the quoted, comma-separated values remain.
  line = line.replace('INSERT INTO `eng2te` VALUES (','')
  line = line.replace('\');','\'')

  engName = getElement(line, 1)
  pos     = getElement(line, 2)
  posType = getElement(line, 3)
  meaning = getElement(line, 4)

  # update the parts of speech
  pos = replacePos(posWords, pos)
  posType = replacePos(posWords, posType)

  # Check if the current page becomes a redirect page
  # A meaning of the form "See xyz." turns this entry into a redirect to
  # 'xyz'; the len(...replace(u'.', u'')) trick trims trailing dots.
  redirectTo = u''
  if meaning[0:4] == u'See ' or meaning[0:4] == u'see ':
     redirectTo = meaning[4:len(meaning.replace(u'.', u''))]
     if redirectTo[0:3] == u'To' or redirectTo[0:3] == u'to':
       redirectTo = redirectTo[3:len(redirectTo)]

  # Check if current page will have redirects from any page
  redirectFrom = u''
  # the 'to' case
  if engName[0:3] == u'To ' or engName[0:3] == u'to ':
     redirectFrom = engName
     engName = engName[3:len(engName)]
  # the 'or' case
  if engName.find(u' or ') != -1:
     redirectFrom = engName.split(u' or ')[1]
     engName      = engName.split(u' or ')[0]
  # the ','  case
  if engName.find(u',') != -1:
     redirectFrom = engName.split(u',')[1]
     engName      = engName.split(u',')[0]

  # Collapse SQL-escaped quotes ('' -> ') and normalize titles to lowercase.
  engName = engName.replace(u'\'\'', u'\'')
  engName      = engName.lower()
  redirectFrom = redirectFrom.lower()
  redirectTo   = redirectTo.lower()

  # replace the * in meaning with engName
  meaning = meaning.replace(u'*', u'\'\'' + engName + u'\'\'')

  # divide the examples in the meaning
  # Normalize '. ' spacing first, then move every sentence after the first
  # onto its own '* ' bullet line.
  meaning = meaning.replace(u'. ', u'.')
  meaning = meaning.replace(u'.', u'. ')
  if meaning.count(u'. ') >= 2:
    meaning = meaning.replace(u'. ', u'.\n* ', meaning.count(u'. ')-1)

  # build the text for the pages
  redirectFromData = u''
  mainPageData     = u''

  if redirectFrom != u'':
    redirectFromData = u'#REDIRECT [[' + engName + u']]\n'

  if redirectTo != u'':
    mainPageData = u'#REDIRECT [[' + redirectTo + u']]\n'
    comment = u'Bot: creating redirect page'
  else:
    # Regular entry: template header + POS labels + meaning + references
    # + category + interwiki link to the English entry.
    mainPageData = brownLine
    if pos != u'': 
      mainPageData = mainPageData + pos + u', '
    if posType != u'': 
      mainPageData = mainPageData + posType + u', '
    mainPageData = mainPageData + meaning + u'\n\n\n' 
    mainPageData = mainPageData + refLine1 + refLine2 + u'\n'
    mainPageData = mainPageData + catline + u'\n'
    mainPageData = mainPageData + u'<!-- Interwiki Links -->\n[[en:' + engName + u']]'
    comment = u'Bot: creating page for a word'

  # Echo the generated page text (plus blank separators) to the console.
  wikipedia.output(u'' + mainPageData)
  wikipedia.output(u'')
  wikipedia.output(u'')
  wikipedia.output(u'')

  #upload to wiktionary
  #upload the redirectFrom page
  # Existing pages are never overwritten: if the page already has other
  # content, the suggested text goes to its Talk page instead.
  if redirectFrom != u'': 
    data = getData(redirectFrom)
    if (data+'\n') == redirectFromData:
      wikipedia.output(u'no need to update any thing')
    elif data == u'':
      writeLogData(redirectFrom, redirectFromData, u'Bot: creating redirect page', logfile)
    else:
      writeLogData(u'Talk:' + redirectFrom, u'Add the following text to main page\n ' + redirectFromData + u'', u'Bot: creating redirect page', logfile)

  #upload the main page 
  data = getData(engName)
  if (data+u'\n') == mainPageData:
    wikipedia.output(u'no need to update any thing')
  elif data == u'':
    writeLogData(engName, mainPageData, comment, logfile)
  else:
    if redirectTo != u'':
      writeLogData(u'Talk:' + engName, u'Add the following text to main page\n ' + mainPageData + u'', comment, logfile)
    else:
      # Append the Brown meaning to an existing word page, unless that
      # exact text is already present.
      if data.find(mainPageData) != -1:
        wikipedia.output(u'no need to do any update')
      else:
        writeLogData(engName, data + u'\n\n' + mainPageData + u'\n\n' + revLine, u'Bot: Updating word page with meaning from Brown dictionary', logfile)

  count = count + 1
## uncomment the following lines while testing the BOT 
#  if count >= 10:
#    break

print 'Total records uploaded - ' + str(count)

dataFile.close()
inputFile.close()
logfile.close()


aa - ab - af - ak - als - am - an - ang - ar - arc - as - ast - av - ay - az - ba - bar - bat_smg - bcl - be - be_x_old - bg - bh - bi - bm - bn - bo - bpy - br - bs - bug - bxr - ca - cbk_zam - cdo - ce - ceb - ch - cho - chr - chy - co - cr - crh - cs - csb - cu - cv - cy - da - de - diq - dsb - dv - dz - ee - el - eml - en - eo - es - et - eu - ext - fa - ff - fi - fiu_vro - fj - fo - fr - frp - fur - fy - ga - gan - gd - gl - glk - gn - got - gu - gv - ha - hak - haw - he - hi - hif - ho - hr - hsb - ht - hu - hy - hz - ia - id - ie - ig - ii - ik - ilo - io - is - it - iu - ja - jbo - jv - ka - kaa - kab - kg - ki - kj - kk - kl - km - kn - ko - kr - ks - ksh - ku - kv - kw - ky - la - lad - lb - lbe - lg - li - lij - lmo - ln - lo - lt - lv - map_bms - mdf - mg - mh - mi - mk - ml - mn - mo - mr - mt - mus - my - myv - mzn - na - nah - nap - nds - nds_nl - ne - new - ng - nl - nn - no - nov - nrm - nv - ny - oc - om - or - os - pa - pag - pam - pap - pdc - pi - pih - pl - pms - ps - pt - qu - quality - rm - rmy - rn - ro - roa_rup - roa_tara - ru - rw - sa - sah - sc - scn - sco - sd - se - sg - sh - si - simple - sk - sl - sm - sn - so - sr - srn - ss - st - stq - su - sv - sw - szl - ta - te - tet - tg - th - ti - tk - tl - tlh - tn - to - tpi - tr - ts - tt - tum - tw - ty - udm - ug - uk - ur - uz - ve - vec - vi - vls - vo - wa - war - wo - wuu - xal - xh - yi - yo - za - zea - zh - zh_classical - zh_min_nan - zh_yue - zu -


aa - ab - af - ak - als - am - an - ang - ar - arc - as - ast - av - ay - az - ba - bar - bat_smg - bcl - be - be_x_old - bg - bh - bi - bm - bn - bo - bpy - br - bs - bug - bxr - ca - cbk_zam - cdo - ce - ceb - ch - cho - chr - chy - co - cr - crh - cs - csb - cu - cv - cy - da - de - diq - dsb - dv - dz - ee - el - eml - en - eo - es - et - eu - ext - fa - ff - fi - fiu_vro - fj - fo - fr - frp - fur - fy - ga - gan - gd - gl - glk - gn - got - gu - gv - ha - hak - haw - he - hi - hif - ho - hr - hsb - ht - hu - hy - hz - ia - id - ie - ig - ii - ik - ilo - io - is - it - iu - ja - jbo - jv - ka - kaa - kab - kg - ki - kj - kk - kl - km - kn - ko - kr - ks - ksh - ku - kv - kw - ky - la - lad - lb - lbe - lg - li - lij - lmo - ln - lo - lt - lv - map_bms - mdf - mg - mh - mi - mk - ml - mn - mo - mr - mt - mus - my - myv - mzn - na - nah - nap - nds - nds_nl - ne - new - ng - nl - nn - no - nov - nrm - nv - ny - oc - om - or - os - pa - pag - pam - pap - pdc - pi - pih - pl - pms - ps - pt - qu - quality - rm - rmy - rn - ro - roa_rup - roa_tara - ru - rw - sa - sah - sc - scn - sco - sd - se - sg - sh - si - simple - sk - sl - sm - sn - so - sr - srn - ss - st - stq - su - sv - sw - szl - ta - te - tet - tg - th - ti - tk - tl - tlh - tn - to - tpi - tr - ts - tt - tum - tw - ty - udm - ug - uk - ur - uz - ve - vec - vi - vls - vo - wa - war - wo - wuu - xal - xh - yi - yo - za - zea - zh - zh_classical - zh_min_nan - zh_yue - zu -