#patent = read.table(file="C:/DataPatentMining/patent_class.txt",
#                sep=',', header=TRUE)

#class = read.csv(file="C:/DataPatentMining/tech_class.csv",
#                   sep=';', header=TRUE)

# Raise the memory ceiling (size is given in MB) before the citation tables
# are loaded. memory.limit() only has an effect on Windows builds of R older
# than 4.2.0; everywhere else it is a stub that just emits a warning, so the
# call is skipped there.
if (.Platform$OS.type == "windows" && getRversion() < "4.2.0")
{
  memory.limit(size = 50000)
}
#NbRows= 6000000#16000000
#Skip = 46360270
#citation = read.table(file="C:/DataPatentMining/yoann.txt",
#                    sep=',', header=FALSE) #, nrows=NbRows, skip = Skip)
#names(citation)=c("citing", "cited", "y_cited", "y_citing", "class_citing", "class_cited", "app_date_citing", "grant_date_citing", "app_date_cited", "grant_date_cited")
# find starting and ending value of a block
#start = 2006
#end = 2011
#first = min(which(citation$y_citing == start))
#last =min(which(citation$y_citing == end)) - 1

# Load the per-year citation tables (2006-2010) and stack them into `citation`.
#
# Every file is read with the same read.table() settings and is given the
# common column names immediately; the pieces are then combined with rbind().
# Compared with appending rows through index assignment, rbind() still works
# when a file has zero rows and merges factor levels instead of turning levels
# that are absent from the first file into NA.
# The per-year pieces are released afterwards; only `citation` and the row
# counts l1..l5 remain in the workspace.
citation_dir = "C:/DataPatentMining/cit_sem_final"
citation_years = 2006:2010
citation_names = c("citing", "cited", "y_citing", "y_cited", "class_citing", "proba_citing", "appdate_citing", "year", "class_cited", "proba_cited", "app_date_cited")

citation_parts = lapply(citation_years, function(yr)
{
  part = read.table(file = file.path(citation_dir, paste0("cit_sem", yr, ".csv")),
                    sep = ',', header = TRUE)
  # columns are identified by position: each file must provide these 11 columns
  names(part) = citation_names
  part
})

# number of rows contributed by each year; l1..l5 are used further down to
# compute the block size
part_rows = vapply(citation_parts, nrow, integer(1))
l1 = part_rows[1]
l2 = part_rows[2]
l3 = part_rows[3]
l4 = part_rows[4]
l5 = part_rows[5]

citation = do.call(rbind, citation_parts)
rm(citation_parts)


#technoProba = load(file="C:/Users/yoann/OneDrive/Documents/DataPatentMining/technoProbas_1977_sizeTh10.RData", envir = parent.frame(), verbose = FALSE)

#exportSubset = read.table(file="C:/Users/yoann/OneDrive/Documents/DataPatentMining/exportSubset.txt",
#                    sep=',', header=TRUE)

#patent10000 = patent[1:10000,]
#p =patent10000[order(patent10000$app_date),] 
#p = p[complete.cases(p),] #remove observations with NA

# ---- Grid-search maximum-likelihood estimate of theta, block by block ----
#
# For every block of rows of `citation`:
#   1. drop incomplete rows, keep citing patents that also occur as a cited
#      patent with y_cited inside [start, end], and apply the probability
#      threshold to proba_citing and proba_cited;
#   2. append one row per cited patent of the window (only `citing` and
#      `appdate_citing` filled) so that these patents are visited as well;
#   3. visit the patents in order of `appdate_citing`. For a patent with class
#      set K, pr = min(.95, n1/n2 + theta), where n1 is the number of earlier
#      patent/class memberships in K and n2 the number of earlier patents.
#      Every (already visited cited patent, class of that cited patent) pair
#      adds log(pr) to the log-likelihood when the class is in K and
#      log(1 - pr) otherwise;
#   4. theta_hat[b] is the grid value of theta maximising the log-likelihood.
#
# Reads `citation` and l1..l5 from the workspace. Prints threshold, theta_hat
# and sd at the end and plots the log-likelihood curve of each block.
# NB: the variables `c`, `t` and `sd` mask base/stats functions of the same
# name; calls such as c("a", "b") keep working because R skips non-function
# bindings when it resolves a call.
NbBlocks = 1
#SizeBlock = (last - first)/NbBlocks
SizeBlock = l1 + l2 + l3 + l4 + l5  # with a single block: all loaded rows
theta_hat = 0  # one estimate per block
sd = 0         # one dispersion figure per block
for (b in seq_len(NbBlocks))
{
  start = 2006  # first year of the window (inclusive)
  end = 2010    # last year of the window (inclusive)
  #c= citation[first:last,]
  c = citation[(1 + (b - 1) * SizeBlock):(b * SizeBlock), ]
  #c = c[order(c$app_date_citing),]
  c = c[complete.cases(c), ]  # remove observations with NA

  # clean c from useless observations: keep the rows whose citing patent is
  # itself cited with y_cited inside the window
  citedWindow = c[(c$y_cited > (start - 1)) & (c$y_cited < (end + 1)), "cited"]
  c = c[c$citing %in% citedWindow, ]

  # remove the ones which are not among citing
  #a = c$citing
  #c=c[c$cited %in% a,]

  threshold = 0.04
  c = c[c$proba_citing > threshold, ]
  c = c[c$proba_cited > threshold, ]

  # add missing patents which are cited: one extra row per (cited patent,
  # application date) of the window; all other columns of these rows are NA
  L = nrow(c)
  cadd = unique(c[(c$y_cited > (start - 1)) & (c$y_cited < (end + 1)),
                  c("cited", "app_date_cited")])
  Ladd = nrow(cadd)
  if (Ladd > 0)  # (L + 1):(L + Ladd) would count downwards when Ladd is 0
  {
    c[(L + 1):(L + Ladd), c("citing", "appdate_citing")] = cadd
  }
  c = c[order(c$appdate_citing), ]

  # t: per technological class, number of already visited patents in it
  t = table(factor(c[, "class_citing"]))
  t[] = 0

  # tpat: per patent, 1 once the patent has been visited, 0 before
  tpat = table(factor(c[, "citing"]))
  tpat[] = 0
  nb_patents = length(tpat)

  PatList = unique(c$citing)      # patents in order of first appearance
  theta = seq(0.01, 1, by = 0.01) # parameter grid of the model
  log_lik = rep(0, length(theta)) # log-likelihood at each grid value
  zPr = 1 # index of the next entry of Pr (number of recorded terms + 1)
  Pr = 0  # table of probabilities (n1/n2 part, one entry per recorded term)
  for (i in seq_len(nb_patents))
  {
    id = PatList[i]
    subc = c[c$citing == id, ]
    tsubc = table(subc$class_citing)
    # classes of this patent; zero counts can only occur when class_citing is
    # a factor, in which case table() lists every level
    own_classes = names(tsubc)[as.vector(tsubc) > 0]
    nbC = length(own_classes)         # number of patent's classes
    n1 = sum(t[own_classes])          # nb of patents from the patent's classes
    n2 = max(1, sum(tpat > 0))        # nb of patents visited so far, at least 1
    pr = pmin(.95, n1 / n2 + theta)   # theoretical probability to cite patents from its own classes
    if (is.na(pr[1]))
    {
      next  # no usable probability: the patent is skipped and not marked visited
    }

    pcited = unique(subc$cited)
    pcited = pcited[!is.na(pcited)]  # appended rows carry no cited patent
    for (k in seq_along(pcited))
    {
      idcited = pcited[k]
      if (idcited %in% names(tpat))
      {
        if (tpat[toString(idcited)] == 1)  # cited patent was visited earlier
        {
          # list of classes from the cited patent; which() drops the rows where
          # the comparison is NA, which would otherwise contribute an NA class
          Cl_cited = unique(subc[which(subc$cited == idcited), "class_cited"])
          Cl_cited = Cl_cited[!is.na(Cl_cited)]
          for (l in seq_along(Cl_cited))
          {
            if (Cl_cited[l] %in% own_classes)
            {
              log_lik = log_lik + log(pr)
            }
            else
            {
              log_lik = log_lik + log(1 - pr)
            }
            # pr[1] belongs to theta = 0.01, so this stores pr without theta
            Pr[zPr] = pr[1] - 0.01
            zPr = zPr + 1
          }
        }
      }
    }

    tpat[toString(id)] = 1  # mark the patent as visited
    if (nbC > 0)            # add the corresponding classes
    {
      t[own_classes] = t[own_classes] + 1
    }
  }

  theta_hat[b] = theta[which.max(log_lik)]
  # NOTE(review): the expression averages (1 - p) / p with p = Pr + theta_hat[b]
  # and uses zPr, which is one more than the number of recorded terms. If a
  # Fisher-information based standard error was intended, both the precedence
  # and the count should be confirmed; the formula is left as it was.
  sd[b] = 1 / (sqrt(zPr) * mean(1 / (Pr + theta_hat[b]) * (1 - (Pr + theta_hat[b]))))
  plot(theta, log_lik)
}
print(threshold)
print(theta_hat)
print(sd)

