# Implementations of the likelihood ratio test of Hardy-weinberg equilibrium for uncertain genotype in sibsips.
# Qiong Li, Helene Massam and Xin Gao
# June 27, 2013

# Arguments
# num: a matrix of the numbers of sibships, with genptype configuration (AA), (Aa) and (aa) in rows and columns.
# u: the probability of being incorrectly coded as a "a" allele when "A" is a true allele.
# y: the probability of being incorrectly coded as a "A" allele when "a" is a true allele.

# Examples
# y <- 0.01   #epsilon1
# u <- 0.01   #epsilon2
# c <- c(2,3,0,1,22,10,0,8,54) # under HWE
# c <- c(10,5,1,4,39,13,1,11,16) # without HWE
# num <- matrix(c,nrow =3, ncol=3) # num[1,2] is the number of sibships with genotype (AA, Aa)
# HWEtest.error(num, y, u)

HWEtest.error <- function(num, y, u)
{ 
  
  
  q <- matrix(0, nrow=6, ncol=6) # Penetrance matrix
  
  q[1,1] <- (1-y)^2*(1-y)^2
  q[1,2] <- 1/4*(1-y)^2*(1-y)^2+2*1/4*(1-y)^2*u*(1-y)+1/4*u*(1-y)*u*(1-y)
  q[1,3] <- u*(1-y)*u*(1-y)
  q[1,4] <- 1/16*(1-y)^2*(1-y)^2+2*1/8*(1-y)^2*u*(1-y)+2*1/16*(1-y)^2*u^2+1/4*u*(1-y)*u*(1-y)+2*1/8*u*(1-y)*u^2+1/16*u^2*u^2
  q[1,5] <- 1/4*u*(1-y)*u*(1-y)+2*1/4*u*(1-y)*u^2+1/4*u^2*u^2
  q[1,6] <- u^2*u^2
  
  q[2,1] <- (1-y)^2*2*y*(1-y)
  q[2,2] <- 1/4*(1-y)^2*2*y*(1-y)+1/4*(1-y)^2*(y*u+(1-y)*(1-u))+1/4*u*(1-y)*2*y*(1-y)+1/4*u*(1-y)*(y*u+(1-y)*(1-u))
  q[2,3] <- u*(1-y)*(y*u+(1-y)*(1-u))
  q[2,4] <- 1/16*(1-y)^2*2*y*(1-y)+1/8*(1-y)^2*(y*u+(1-y)*(1-u))+1/8*u*(1-y)*2*y*(1-y)+1/16*(1-y)^2*2*u*(1-u)+
    1/16*u^2*2*y*(1-y)+1/4*u*(1-y)*(y*u+(1-y)*(1-u))+1/8*u*(1-y)*2*u*(1-u)+1/8*u^2*(y*u+(1-y)*(1-u))+1/16*u^2*2*u*(1-u)
  q[2,5] <- 1/4*u*(1-y)*(y*u+(1-y)*(1-u))+1/4*u*(1-y)*2*u*(1-u)+1/4*u^2*(y*u+(1-y)*(1-u))+1/4*u^2*2*u*(1-u)
  q[2,6] <- u^2*2*u*(1-u)
  
  q[3,1] <- (1-y)^2*y^2
  q[3,2] <- 1/4*(1-y)^2*y^2+1/4*(1-y)^2*y*(1-u)+1/4*u*(1-y)*y^2+1/4*u*(1-y)*y*(1-u)
  q[3,3] <- u*(1-y)*y*(1-u)
  q[3,4] <- 1/16*(1-y)^2*y^2+1/8*(1-y)^2*y*(1-u)+1/8*u*(1-y)*y^2+1/16*(1-y)^2*(1-u)^2+1/16*u^2*y^2+1/4*u*(1-y)*y*(1-u)+
    1/8*u*(1-y)*(1-u)^2+1/8*u^2*y*(1-u)+1/16*u^2*(1-u)^2
  q[3,5] <- 1/4*u*(1-y)*y*(1-u)+1/4*u*(1-y)*(1-u)^2+1/4*u^2*y*(1-u)+1/4*u^2*(1-u)^2
  q[3,6] <- u^2*(1-u)^2
  
  q[4,1] <- 2*y*(1-y)*2*y*(1-y)
  q[4,2] <- 1/4*2*y*(1-y)*2*y*(1-y)+2*1/4*2*y*(1-y)*(y*u+(1-y)*(1-u))+1/4*(y*u+(1-y)*(1-u))^2
  q[4,3] <- (y*u+(1-y)*(1-u))^2
  q[4,4] <- 1/16*2*y*(1-y)*2*y*(1-y)+2*1/8*2*y*(1-y)*(y*u+(1-y)*(1-u))+2*1/16*2*y*(1-y)*2*u*(1-u)+1/4*(y*u+(1-y)*(1-u))^2+
    2*1/8*(y*u+(1-y)*(1-u))*2*u*(1-u)+1/16*2*u*(1-u)*2*u*(1-u)
  q[4,5] <- 1/4*(y*u+(1-y)*(1-u))^2+2*1/4*(y*u+(1-y)*(1-u))*2*u*(1-u)+1/4*2*u*(1-u)*2*u*(1-u)
  q[4,6] <- 2*u*(1-u)*2*u*(1-u)
  
  q[5,1] <- 2*y*(1-y)*y^2
  q[5,2] <- 1/4*2*y*(1-y)*y^2+1/4*2*y*(1-y)*y*(1-u)+1/4*(y*u+(1-y)*(1-u))*y^2+1/4*(y*u+(1-y)*(1-u))*y*(1-u)
  q[5,3] <- (y*u+(1-y)*(1-u))*y*(1-u)
  q[5,4] <- 1/16*2*y*(1-y)*y^2+1/8*2*y*(1-y)*y*(1-u)+1/8*(y*u+(1-y)*(1-u))*y^2+1/16*2*y*(1-y)*(1-u)^2+
    1/16*2*u*(1-u)*y^2+1/4*(y*u+(1-y)*(1-u))*y*(1-u)+1/8*(y*u+(1-y)*(1-u))*(1-u)^2+1/8*y*(1-u)*2*u*(1-u)+
    1/16*2*u*(1-u)*(1-u)^2
  q[5,5] <- 1/4*(y*u+(1-y)*(1-u))*y*(1-u)+1/4*(y*u+(1-y)*(1-u))*(1-u)^2+1/4*y*(1-u)*2*u*(1-u)+
    1/4*2*u*(1-u)*(1-u)^2
  q[5,6] <- 2*u*(1-u)*(1-u)^2
  
  q[6,1] <- y^2*y^2
  q[6,2] <- 1/4*y^2*y^2+2*1/4*y^2*y*(1-u)+1/4*y*(1-u)*y*(1-u)
  q[6,3] <- y*(1-u)*y*(1-u)
  q[6,4] <- 1/16*y^2*y^2+2*1/8*y^2*y*(1-u)+2*1/16*y^2*(1-u)^2+1/4*y*(1-u)*y*(1-u)+2*1/8*y*(1-u)*(1-u)^2+1/16*(1-u)^2*(1-u)^2
  q[6,5] <- 1/4*y*(1-u)*y*(1-u)+2*1/4*y*(1-u)*(1-u)^2+1/4*(1-u)^2*(1-u)^2
  q[6,6] <- (1-u)^2*(1-u)^2
  
  
  nu <- rep(0, 6)
  
  nu[1] <- num[1,1]           #the number of sibships with genotype (AA,AA)
  nu[2] <- num[1,2] +num[2,1] #the number of sibships with genotype (AA, Aa) or (Aa, AA)
  nu[3] <- num[1,3] +num[3,1]
  nu[4] <- num[2,2]
  nu[5] <- num[2,3] +num[3,2]
  nu[6] <- num[3,3]
  
  nu
  
  
  m<- 100 # the number of iterations
  
  pt1<- matrix(0, m, 3)
  w<- matrix(0, nrow = 6, ncol = 6)
  pt1[1, ] <- c(0.1, 0.7, 0.2) # initial value
  t<- 1
  while (t<m)
  {
    
    for (i in 1:6)
    {
      
      w[i,1]<- pt1[t,1]*pt1[t,1]*q[i,1]
      w[i,2]<- 2*pt1[t,1]*pt1[t,2]*q[i,2]
      w[i,3]<- 2*pt1[t,1]*pt1[t,3]*q[i,3]
      w[i,4]<- pt1[t,2]*pt1[t,2]*q[i,4]
      w[i,5]<- 2*pt1[t,2]*pt1[t,3]*q[i,5]
      w[i,6]<- pt1[t,3]*pt1[t,3]*q[i,6]
      
    }
    
    w1<- rep(0, 6)
    
    for (i in 1:6)
    {
      w1[i]<- sum(w[i,])
    }
    
    w2<-rep(0, 6)
    for (i in 1:6)
    {
      w2[i]=nu[i]*(2*w[i,1]+w[i,2]+w[i,3])/w1[i]
    }
    
    p1<- sum(w2)
    
    w3<-rep(0, 6)
    for (i in 1:6)
    {
      w3[i]=nu[i]*(2*w[i,4]+w[i,2]+w[i,5])/w1[i]
    }
    p2<- sum(w3)
    
    w4<-rep(0, 6)
    for (i in 1:6)
    {
      w4[i]=nu[i]*(2*w[i,6]+w[i,3]+w[i,5])/w1[i]
    }
    
    p3<- sum(w4)
    
    s<-sum(p1+p2+p3)
    
    t= t+1
    
    pt1[t,]<- c(p1/s, p2/s, p3/s)
    
  }
  
  pt1[m,] # the estimate of genotype frequences
  
  ## EM algorithm for alelles frequences
  
  
  ma<- 100 # the number of iterations
  
  wa<- matrix(0, nrow = 6, ncol = 6)
  pt1a<- matrix(0, ma, 2)
  pt1a[1, ] <- c(0.1, 0.9) # initial value
  t<- 1
  while (t < ma)
  {
    
    
    for (i in 1:6)
    {
      
      wa[i,1]<- (pt1a[t,1]^2)*(pt1a[t,1]^2)*q[i,1]
      wa[i,2]<- 2*(pt1a[t,1]^2)*(2*pt1a[t,1]*pt1a[t,2])*q[i,2]
      wa[i,3]<- 2*(pt1a[t,1]^2)*(pt1a[t,2]^2)*q[i,3]
      wa[i,4]<- (2*pt1a[t,1]*pt1a[t,2])*(2*pt1a[t,1]*pt1a[t,2])*q[i,4]
      wa[i,5]<- 2*(2*pt1a[t,1]*pt1a[t,2])*(pt1a[t,2]^2)*q[i,5]
      wa[i,6]<- (pt1a[t,2]^2)*(pt1a[t,2]^2)*q[i,6]
      
    }
    
    w1a <- rep(0, 6)
    
    for (i in 1:6)
    {
      w1a[i] <- sum(wa[i,])
    }
    
    w2a<-rep(0, 6)
    for (i in 1:6) # i = gi
    {
      w2a[i]=nu[i]*(4*wa[i,1]+3*wa[i,2]+2*wa[i,3]+2*wa[i,4]+wa[i,5])/w1a[i]
    }
    
    p1a<- sum(w2a)
    
    w3a<-rep(0, 6)
    for (i in 1:6)
    {
      w3a[i]=nu[i]*(wa[i,2]+2*wa[i,3]+2*wa[i,4]+3*wa[i,5]+4*wa[i,6])/w1a[i]
    }
    p2a<- sum(w3a)
    
    sa<-sum(p1a+p2a)
    
    t= t+1
    
    pt1a[t,]<- c(p1a/sa, p2a/sa)
    
  }
  
  pt1a[ma,] # the estimate of alleles frequences
  
  ##### Likelihood ratio test
  
  ### caculate unconstrained log - likelihood function
  
  l <- matrix(0, nrow = 6, ncol = 6)
  
  for (i in 1:6)
  {
    
    l[i,1]<- pt1[m,1]*pt1[m,1]*q[i,1]
    l[i,2]<- 2*pt1[m,1]*pt1[m,2]*q[i,2]
    l[i,3]<- 2*pt1[m,1]*pt1[m,3]*q[i,3]
    l[i,4]<- pt1[m,2]*pt1[m,2]*q[i,4]
    l[i,5]<- 2*pt1[m,2]*pt1[m,3]*q[i,5]
    l[i,6]<- pt1[m,3]*pt1[m,3]*q[i,6]
    
  }
  
  l1<- rep(0, 6)
  
  for (i in 1:6)
  {
    l1[i]<- log(sum(l[i,]))
  }
  
  
  ln1 <- sum(nu*l1) # unconstrained log - likelihood function
  
  ### calculate log - likelihood function under HWE
  
  la <- matrix(0, nrow = 6, ncol = 6)
  
  for (i in 1:6)
  {
    
    la[i,1]<- (pt1a[ma,1]^2)*(pt1a[ma,1]^2)*q[i,1]
    la[i,2]<- 2*(pt1a[ma,1]^2)*(2*pt1a[ma,1]*pt1a[ma,2])*q[i,2]
    la[i,3]<- 2*(pt1a[ma,1]^2)*(pt1a[ma,2]^2)*q[i,3]
    la[i,4]<- (2*pt1a[ma,1]*pt1a[ma,2])*(2*pt1a[ma,1]*pt1a[ma,2])*q[i,4]
    la[i,5]<- 2*(2*pt1a[ma,1]*pt1a[ma,2])*(pt1a[ma,2]^2)*q[i,5]
    la[i,6]<- (pt1a[ma,2]^2)*(pt1a[ma,2]^2)*q[i,6]
    
  }
  
  l1a <- rep(0, 6)
  
  for (i in 1:6)
  {
    l1a[i] <- log(sum(la[i,]))
  }
  
  
  ln0 <- sum(nu*l1a) # constrained log - likelihood function 
  
  kai <- -2*(ln0 - ln1)
  
  pv  <- 1 - pchisq(kai, df = 1)
  
  return(pv)}