వాడుకరి:Mpradeepbot/mpc.wkt.brown.py

వికీపీడియా నుండి
Jump to navigation Jump to search

ఈ ప్రోగ్రాముకు అనుబంధంగా ఈ ఫైలుని వాడండి. బ్రౌను పదకోశం డేటాబేసు ఈ విధంగా ఉంటుంది.

import wikipedia, time, config, codecs

# Replace the contents in the page 'pageTitle' with data 'pageData' 
# and add the comment 'comment'
def writeData(pageTitle, pageData, comment):
  page = wikipedia.Page(wikipedia.getSite(), pageTitle)
  try:
    # Load the page's text from the wiki
    data = page.get()
  except wikipedia.NoPage:
    data = u''
  data = pageData
  try:
    page.put(data, comment = comment)
  except wikipedia.EditConflict:
    wikipedia.output(u'Skipping %s because of edit conflict' % (page.title()))
  except wikipedia.SpamfilterError, url:
    wikipedia.output(u'Cannot change %s because of blacklist entry %s' % (page.title(), url))
  wikipedia.output(u'Waiting for 1 second(s)')
  time.sleep(1)

# Appends a record of the upload to 'logfile' and then writes the page to the
# wiktionary through writeData().
#   pageTitle - title of the page being written
#   pageData  - text that is uploaded
#   comment   - edit summary of the upload
#   logfile   - writable file object; must accept unicode strings
# The log record consists of the title line, the comment line and the page
# text followed by three line breaks.
def writeLogData(pageTitle, pageData, comment, logfile):
  record = (
    (u'Title: ', pageTitle, u'\r\n'),
    (u'comment: ', comment, u'\r\n'),
    (u'', pageData, u'\r\n\r\n\r\n'),
  )
  # each part is assembled and written in turn, title first
  for prefix, text, suffix in record:
    logfile.write(prefix + text + suffix)

  writeData(pageTitle, pageData, comment)


# Fetches the current text of the wiki page 'pageTitle'.
# The page is requested with nofollow_redirects=True. When the framework
# reports that the page does not exist (wikipedia.NoPage), an empty unicode
# string is returned instead of the page text.
def getData(pageTitle):
  wikiPage = wikipedia.Page(wikipedia.getSite(), pageTitle)

  try:
    return wikiPage.get(nofollow_redirects=True)
  except wikipedia.NoPage:
    return u''
  
# Returns one field of a comma separated list of single-quoted SQL values,
# without the surrounding quotes and without leading or trailing white space.
#   line     - text such as  'word', 'n', '', 'the meaning'
#   position - 1-based number of the wanted field
# A doubled quote ('') inside a field is the SQL escape for one quote; it is
# kept in the result exactly as written and never ends the field, even when a
# comma follows it.
# Raises IndexError when the line has fewer than 'position' fields.
def getElement(line, position):
  # function-scope import: the file-level import line does not provide 're'
  import re

  # A field starts at a quote and runs up to its closing quote. Inside it
  # the pattern accepts, in this order:
  #   ''        an escaped quote
  #   [^']      any ordinary character
  #   '(?!...)  a stray unescaped quote, i.e. one that is NOT followed by a
  #             comma or by the end of the line (a closing quote always is)
  # Scanning the fields this way, instead of cutting the line at every
  # quote+comma pair, keeps text such as  'he said ''no'', and left'  in
  # one piece.
  fieldPattern = r"'((?:''|[^']|'(?!\s*(?:,|$)))*)'"
  fields = re.findall(fieldPattern, line)

  return fields[position - 1].strip()

# Translates a part-of-speech abbreviation into its display text.
#   posWords - sequence of display texts, indexed as listed in the table below
#   pos      - abbreviation found in the dictionary data
# Returns the entry of 'posWords' that belongs to 'pos'. A value that is not
# one of the known abbreviations (the empty string included) is returned
# unchanged.
def replacePos(posWords, pos):
  # (abbreviations, index into posWords)
  slots = (
    (('a', 'adj'), 0),
    (('n',),       1),
    (('pron',),    2),
    (('v',),       3),
    (('p',),       4),
    (('adv',),     5),
    (('prep',),    6),
    (('conj',),    7),
    (('interj',),  8),
  )

  for abbreviations, index in slots:
    if pos in abbreviations:
      return posWords[index]

  # not a known abbreviation
  return pos

# ---------------------------------------------------------------------------
# Main script.
# Reads the dictionary records from the data file one line at a time, builds
# the wiki text of the word page (or of a redirect page) from the template
# lines of the input file and uploads it. Every upload is also recorded in
# the log file.
# ---------------------------------------------------------------------------
dataFile  = None
inputFile = None
logfile   = None

# try/finally makes sure the files are closed - and the log is flushed - even
# when an upload aborts the run with an exception.
try:
  dataFile  = open('mpc.wkt.brown.part62.txt', 'rb' )
  inputFile = open('mpc.wkt.brown.musa.txt', 'rb' )
  logfile   = codecs.open('mpc.wkt.brown.log', encoding='utf-8', mode='wb')

  # Skip the UTF-8 byte order mark of the template file, but only when it is
  # really there; otherwise rewind so that the first line is not truncated.
  # (A byte order mark in the data file needs no handling: getElement()
  # ignores everything in front of the first quote.)
  if inputFile.read(len(codecs.BOM_UTF8)) != codecs.BOM_UTF8:
    inputFile.seek(0)

  # initialize the parts of speech: the first nine template lines, in the
  # order expected by replacePos(), each wrapped in ''' (wiki bold markup)
  posWords = []
  while len(posWords) < 9:
    line = unicode(inputFile.readline(), 'utf8')
    line = line.replace(u'\n', u'').replace(u'\r', u'')
    posWords.append(u'\'\'\'' + line + u'\'\'\'')

  # the next five template lines are used as they are, line break included
  brownLine = unicode(inputFile.readline(), 'utf8')
  refLine1  = unicode(inputFile.readline(), 'utf8')
  refLine2  = unicode(inputFile.readline(), 'utf8')
  catline   = unicode(inputFile.readline(), 'utf8')
  revLine   = unicode(inputFile.readline(), 'utf8')

  count = 0
  site = wikipedia.getSite()

  for line in dataFile:
    line = unicode(line, 'utf8')

    # A blank line holds no record; without this check the lookup of the
    # second field would raise IndexError and stop the whole run.
    if line.strip() == u'':
      continue

    # reduce the SQL statement to the bare list of quoted values
    line = line.replace('INSERT INTO `eng2te` VALUES (','')
    line = line.replace('\');','\'')

    engName = getElement(line, 1)
    pos     = getElement(line, 2)
    posType = getElement(line, 3)
    meaning = getElement(line, 4)

    # update the parts of speech
    pos     = replacePos(posWords, pos)
    posType = replacePos(posWords, posType)

    # Check if the current page becomes a redirect page
    redirectTo = u''
    if meaning[0:4] == u'See ' or meaning[0:4] == u'see ':
      # drop the 'See ' prefix and the full stop that closes the sentence;
      # periods inside the target are left alone
      redirectTo = meaning[4:].rstrip(u'.').strip()
      # the target is filed without its leading 'To ', exactly like the page
      # names below (the prefix is three characters long, blank included)
      if redirectTo[0:3] == u'To ' or redirectTo[0:3] == u'to ':
        redirectTo = redirectTo[3:]

    # Check if current page will have redirects from any page
    redirectFrom = u''
    # the 'to' case
    if engName[0:3] == u'To ' or engName[0:3] == u'to ':
      redirectFrom = engName
      engName = engName[3:]
    # the 'or' case
    if u' or ' in engName:
      alternatives = engName.split(u' or ')
      redirectFrom = alternatives[1].strip()
      engName      = alternatives[0].strip()
    # the ','  case (strip() removes the blank that follows the comma)
    if u',' in engName:
      alternatives = engName.split(u',')
      redirectFrom = alternatives[1].strip()
      engName      = alternatives[0].strip()

    # undo the SQL escaping of the quote, then normalize the case
    engName      = engName.replace(u'\'\'', u'\'')
    engName      = engName.lower()
    redirectFrom = redirectFrom.lower()
    redirectTo   = redirectTo.lower()

    # replace the * in meaning with engName
    meaning = meaning.replace(u'*', u'\'\'' + engName + u'\'\'')

    # divide the examples in the meaning: after normalizing the blank behind
    # every period, each sentence but the last one is followed by a new
    # bullet line
    meaning = meaning.replace(u'. ', u'.')
    meaning = meaning.replace(u'.', u'. ')
    sentenceBreaks = meaning.count(u'. ')
    if sentenceBreaks >= 2:
      meaning = meaning.replace(u'. ', u'.\n* ', sentenceBreaks - 1)

    # build the text for the pages
    redirectFromData = u''
    mainPageData     = u''

    if redirectFrom != u'':
      redirectFromData = u'#REDIRECT [[' + engName + u']]\n'

    if redirectTo != u'':
      mainPageData = u'#REDIRECT [[' + redirectTo + u']]\n'
      comment = u'Bot: creating redirect page'
    else:
      mainPageData = brownLine
      if pos != u'':
        mainPageData = mainPageData + pos + u', '
      if posType != u'':
        mainPageData = mainPageData + posType + u', '
      mainPageData = mainPageData + meaning + u'\n\n\n'
      mainPageData = mainPageData + refLine1 + refLine2 + u'\n'
      mainPageData = mainPageData + catline + u'\n'
      mainPageData = mainPageData + u'<!-- Interwiki Links -->\n[[en:' + engName + u']]'
      comment = u'Bot: creating page for a word'

    wikipedia.output(mainPageData)
    wikipedia.output(u'')
    wikipedia.output(u'')
    wikipedia.output(u'')

    #upload to wiktionary
    #upload the redirectFrom page
    if redirectFrom != u'':
      data = getData(redirectFrom)
      if (data+'\n') == redirectFromData:
        wikipedia.output(u'no need to update any thing')
      elif data == u'':
        writeLogData(redirectFrom, redirectFromData, u'Bot: creating redirect page', logfile)
      else:
        # the page exists with other content: leave a note on its talk page
        writeLogData(u'Talk:' + redirectFrom, u'Add the following text to main page\n ' + redirectFromData, u'Bot: creating redirect page', logfile)

    #upload the main page
    data = getData(engName)
    if (data+u'\n') == mainPageData:
      wikipedia.output(u'no need to update any thing')
    elif data == u'':
      writeLogData(engName, mainPageData, comment, logfile)
    elif redirectTo != u'':
      # never replace an existing page by a redirect; ask on the talk page
      writeLogData(u'Talk:' + engName, u'Add the following text to main page\n ' + mainPageData, comment, logfile)
    elif mainPageData in data:
      wikipedia.output(u'no need to do any update')
    else:
      # append the new entry to the existing text
      writeLogData(engName, data + u'\n\n' + mainPageData + u'\n\n' + revLine, u'Bot: Updating word page with meaning from Brown dictionary', logfile)

    count = count + 1
## uncomment the following lines while testing the BOT
#    if count >= 10:
#      break

  # parenthesized so that the statement reads the same as a function call;
  # the printed text is unchanged
  print('Total records uploaded - ' + str(count))

finally:
  for openedFile in (dataFile, inputFile, logfile):
    if openedFile is not None:
      openedFile.close()