## వాడుకరి:Arjunaraoc/R Scripts/pageview trends yearly
##
## వికీపీడియా నుండి (from Wikipedia)
## Jump to navigation Jump to search

## Get the top page-view pages of a given language wiki project for a
## specified year from the Wikimedia pageviews REST API.
## prj is the project domain; for Telugu Wikipedia it is te.wikipedia.org
## atype is the access type used in the URL, e.g. all-access
## y is the year (yyyy) for which data is required
## The result is a data frame consisting of article, views and rank.
## The monthly top lists ("all-days") of the twelve months are fetched one by
## one and the views of each article are summed, then averaged per day.
## Example of a (daily) request URL of this API:
## "https://wikimedia.org/api/rest_v1/metrics/pageviews/top/te.wikipedia.org/all-access/2018/03/01"
library(jsonlite)
library(lubridate)
## Average daily page views and rank of the top-viewed articles of one wiki
## project over a calendar year.
##
## For each month 01..12 the monthly top list (day component "all-days") is
## requested, the views of every article are summed over the months, the
## articles are ranked by that total (1 = most viewed, tied articles share
## the best rank) and the total is divided by the number of days in the year.
##
## Arguments:
##   prj    project domain used in the URL, e.g. "te.wikipedia.org"
##   atype  access-type path component, e.g. "all-access"
##   y      year; only the first four characters are used
##   fetch  function(url) returning the parsed response; the default calls
##          fromJSON(url,flatten=TRUE). Passing another function lets cached
##          or offline data be used instead of a live request.
## Value: data frame of the merged monthly article tables, sorted by
##   decreasing views, with views = rounded average views per day and rank
##   recomputed from the yearly totals. A zero-row frame with columns
##   article, views, rank is returned when no month yields any article.
## Side effects: prints the month number and the URL of every request.
## Note: an article only contributes the months in which it appears in the
##   monthly top list, so totals of rarely listed articles are underestimates.
pyy_views<-function(prj="te.wikipedia.org",atype="all-access",y="2017",
                    fetch=function(url) fromJSON(url,flatten=TRUE)) {

   urlprefix<-"https://wikimedia.org/api/rest_v1/metrics/pageviews/top/"

   y<-substr(y,1,4)
   yr<-suppressWarnings(as.numeric(y))
   if (is.na(yr)) {
      stop("y must start with a four-digit year",call.=FALSE)
   }
   ## Gregorian leap-year rule, so no extra package is needed for this
   is_leap<-(yr %% 4 == 0 && yr %% 100 != 0) || yr %% 400 == 0
   daysinyear<-if (is_leap) 366 else 365

   d<-"all-days"
   maxmonths<-12
   top<-NULL
   for (m in seq_len(maxmonths)) {
      mm<-sprintf("%02d",m)
      given_date<-paste0(y,"/",mm,"/",d)
      url<-paste0(urlprefix,prj,'/',atype,'/',given_date)
      cat("..",m)
      print(url)
      pv<-fetch(url)
      pvdf<-pv$items$articles[[1]]
      ## nothing to merge for a month without articles
      if (is.null(pvdf) || nrow(pvdf)==0) {
         next
      }
      ## Parsed view counts are integers; adding twelve months of a very
      ## popular page exceeds the integer range and would turn into NA,
      ## so accumulate in double precision.
      pvdf$views<-as.numeric(pvdf$views)
      if (is.null(top)) {
         top<-pvdf
      } else {
         ## add views of articles already seen, append the new ones
         pos<-match(pvdf$article,top$article)
         known<-!is.na(pos)
         top$views[pos[known]]<-top$views[pos[known]]+pvdf$views[known]
         top<-rbind(top,pvdf[!known,])
      }
   }
   if (is.null(top)) {
      return (data.frame(article=character(),views=numeric(),rank=numeric(),
                         stringsAsFactors=FALSE))
   }
   top<-top[order(-top$views),]
   ## ascending rank with ties at the maximum, flipped so that the most
   ## viewed article gets rank 1 and ties share the smallest rank
   top$rank<-rank(top$views,ties.method="max")
   top$rank<-max(top$rank)-top$rank+1
   top$views<-round(top$views/daysinyear,0)
   row.names(top)<-NULL
   return (top)
}

# Compare a later top-views table with an earlier one.
#
# Arguments:
#   latd  data frame of the later period with columns article, views, rank
#   eard  data frame of the earlier period with the same columns
# Value: the rows of latd whose article also occurs in eard (the first
#   occurrence in eard is used), with these columns added:
#     rankchange    latd rank minus eard rank, i.e. a negative value means
#                   the article moved up the ranking
#     viewschangep  change of views in percent of the eard views, 2 decimals
#     pviews        views of the article in eard
#   Rows whose rankchange is NA are dropped and row names are reset.
#   When the two tables share no article, a zero-row data frame that still
#   carries the three added columns is returned.
# Note: an eard views value of 0 gives Inf or NaN in viewschangep.
pv_trends<-function(latd,eard) {

   # position of every latd article in eard, NA when it is absent there
   pos<-match(latd$article,eard$article)
   prev_rank<-eard$rank[pos]
   prev_views<-eard$views[pos]
   # full-length columns: unmatched rows become NA instead of being left
   # untouched, so the columns always exist and never hold stale values
   latd$rankchange<-latd$rank-prev_rank
   latd$viewschangep<-round(((latd$views-prev_views)/prev_views)*100,2)
   latd$pviews<-prev_views
   latd<-latd[!is.na(latd$rankchange),]
   row.names(latd)<-NULL
   return (latd)
}