/* Analytical computation of the hessian
   Does not  correctly work for the softmax/entropy */


/* To check the analytical 2nd derivatives, computation of the 
   numerical 2nd derivatives is provided, c.f. "NRC, 186-189". 
   This is mixed style C/C++. */

#ifdef NUM_CHECK
static long icom, jcom;  // communication variables for dfridr
static double *wscom, *dwcom;

double num_sec_deriv (double arg)  // used by dfridr
{
  double temp;

  temp = wscom[jcom];
  wscom[jcom] = arg;
  ffnet_derr_func (wscom, dwcom);
  wscom[jcom] = temp;
  return dwcom[icom];
}
#endif

mat ffnet::hess (const mat& x, const mat& y, const intset& fixed_weights, 
		 int objective_function, int regularizer, double reg_control)
  // compute hessian matrix of given objective function at the current weight vector
{
  mat h(w.rows(),w.rows());

#ifdef CHECK
  if ((x.columns() != nin) || (y.columns() != nout) || (x.rows() != y.rows()))
    RTerror ("neuro.cc", "mat ffnet::hess (const mat& x, const mat& y,"
	     "const intset& fixed_weights, int objective_function, int regularizer,"
	     "double reg_control)", "1");
#endif  
  long i, ip, j, jp, j_temp, jp_temp, n, k, kp, k_temp, kp_temp, kc_temp, kpc_temp; 
  double sum, temp1, temp2, temp3;
  vec hid (nhid), ynet (nout), delta(x.rows()), H(x.rows());
  long nhidout = (nin+1)*nhid;
  long nshortc = (nin+1)*nhid+(nhid+1)*nout;

  temp1 = temp2 = temp3 = 0.0;
  if (regularizer == NOR)  // no regularizer term
  {
    for (i=1; i<=w.rows(); i++)
      for (j=1; j<=w.rows(); j++)
	h(i,j) = 0.0;
  }
  else if (regularizer == WDR)  // weight decay regularizer
  {
    for (i=1; i<=w.rows(); i++)
    {
      for (j=1; j<=w.rows(); j++)
	h(i,j) = 0.0;
      h(i,i) = 2.0*reg_control;      
    }
  }
  else if (regularizer == LASSO)  // lasso regularizer 
  {
    for (i=1; i<=w.rows(); i++)
      for (j=1; j<=w.rows(); j++)
	h(i,j) = 0.0;
  }
  else
    RTerror ("neuro.cc", "mat ffnet::hess (const mat& x, const mat& y,"
	     "const intset& fixed_weights, int objective_function, int regularizer,"
	     "double reg_control)", "2");
  /*
    Hessian algorithm, c.f. "NNPR, 154-158". First, only the upper
    triangle of the Hessian is computed. Computation of the lower triangle is at the end.
    The index notation is as similar as possible to the book. However the weight index 
    has to be converted from the matrix index in the book to the vector index here.
  */
  for (n=1; n<=x.rows(); n++)  // loop over observations
  {
    forward_propagate (&(x.elem())[n][0], hid.elem(), ynet.elem(), w.elem(), 
		       nin, nhid, nout, hid_t, out_t, shortc);   
    for (k=1; k<=nout; k++)  /* both weights are in the hidden to output layer or shortcuts, 
				c.f. "NNPR, 157, (4.82)" */
    { 
      if ((objective_function == SSE) && (out_t == LIN))  // compute delta[k] and H[k][k], used later
      {
	delta(k) = 2.0*(ynet(k)-y(n,k));
	H(k) = 2.0;
      }
      else if ((objective_function == ENTROPY) && (out_t == SOFT))
      {
	delta(k) = (ynet(k)-y(n,k)); 
	H(k) = (ynet(k)-DSQR(ynet(k)));
      }
      else if (((objective_function == MAD) && (out_t == LIN)))
      {
	delta(k) = SIGN(1.0,ynet(k)-y(n,k));
	H(k) = 0.0;
      }
      k_temp = (k-1)*(nhid+1)+1;  // used for our vector index
      kc_temp = (k-1)*nin;  // vector index
      kp = k;  // Kronecker delta, i.e., only for kp equal k, (4.82) is not equal zero
      kp_temp = (kp-1)*(nhid+1)+1;  // vector index
      kpc_temp = (kp-1)*nin;  // vector index
      h(nhidout+k_temp,nhidout+kp_temp) += H(k);  // both weights are bias
      for (jp=1; jp<=nhid; jp++)  // one bias
	h(nhidout+k_temp,nhidout+kp_temp+jp) += H(k)*hid(jp); 
      for (j=1; j<=nhid; j++)  // no bias
	for (jp=j; jp<=nhid; jp++)	  
	  h(nhidout+k_temp+j,nhidout+kp_temp+jp) += H(k)*hid(j)*hid(jp);
      if (shortc)  // there are shortcuts
      {
	for (j=1; j<=nin; j++)  // both shortcuts
	  for (jp=j; jp<=nin; jp++)	
	    h(nshortc+kc_temp+j,nshortc+kpc_temp+jp) += H(k)*x(n,j)*x(n,jp);	  
	for (jp=1; jp<=nin; jp++)  // one shortcut 
	{
	  h(nhidout+k_temp,nshortc+kpc_temp+jp) += H(k)*x(n,jp);  // one bias 
	  for (j=1; j<=nhid; j++)  // no bias 
	    h(nhidout+k_temp+j,nshortc+kpc_temp+jp) += H(k)*hid(j)*x(n,jp);
	}
      }
    }
    for (j=1; j<=nhid; j++)  /* both weights in the input to hidden layer
				c.f. "NNPR, 157, (4.83)" */
    { 
      j_temp = (j-1)*(nin+1)+1;
      if (hid_t == SIG)  // compute 1st derivative of hidden activation function (HAF)
	temp1 = hid(j)*(1.0-hid(j));  
      else if (hid_t == TAN)
	temp1 = (1.0-DSQR(hid(j)));
      else
	RTerror ("neuro.cc", "mat ffnet::hess (const mat& x, const mat& y,"
		 "const intset& fixed_weights, int objective_function, int regularizer,"
		 "double reg_control)", "3");
      for (jp=1; jp<=nhid; jp++)  
      {
	jp_temp = (jp-1)*(nin+1)+1;
	if (hid_t == SIG)  // compute 1st and 2nd derivative of HAF
	{
	  temp2 = hid(jp)*(1.0-hid(jp));
	  temp3 = temp2*(1.0-2.0*hid(jp)); 
	}
	else if (hid_t == TAN)
	{
	  temp2 = (1.0-DSQR(hid(jp)));
	  temp3 = (-2.0)*hid(jp)*temp2;
	}
	else
	  RTerror ("neuro.cc", "mat ffnet::hess (const mat& x, const mat& y,"
		   "const intset& fixed_weights, int objective_function, int regularizer,"
		   "double reg_control)", "4");
	if (j == jp)
	{
	  sum = 0.0;
	  for (k=1; k<=nout; k++)  // sum of the 1st term from (4.83)
	  {
	    k_temp = (k-1)*(nhid+1)+1;
	    sum += w(nhidout+k_temp+jp)*delta(k);
	  }	
	  h(j_temp,jp_temp) += temp3*sum;  // compute the whole 1st term of (4.83) and save it
	  for (ip=1; ip<=nin; ip++)	
	    h(j_temp,jp_temp+ip) += x(n,ip)*temp3*sum;
	  for (i=1; i<=nin; i++)
	    for (ip=i; ip<=nin; ip++)	
	      h(j_temp+i,jp_temp+ip) += x(n,i)*x(n,ip)*temp3*sum;
	}
	sum = 0.0;
	for (k=1; k<=nout; k++)  // sum of the 2nd term from (4.83)
	{
	  k_temp = (k-1)*(nhid+1)+1;
	  sum += w(nhidout+k_temp+jp)*w(nhidout+k_temp+j)*H(k);
	}
	/* add the whole 2nd term to the already saved 1st and
	   compute only the upper triangle */
	if (j_temp <= jp_temp) h(j_temp,jp_temp) += temp1*temp2*sum;  
	for (i=1; i<=IMIN(jp_temp-j_temp,nin); i++)  
	  h(j_temp+i,jp_temp) += x(n,i)*temp1*temp2*sum;
	for (ip=IMAX(j_temp-jp_temp,1); ip<=nin; ip++)	
	  h(j_temp,jp_temp+ip) += x(n,ip)*temp1*temp2*sum;
	for (i=1; i<=nin; i++)
	  for (ip=IMAX(j_temp-jp_temp+i,1); ip<=nin; ip++)	
	    h(j_temp+i,jp_temp+ip) += x(n,i)*x(n,ip)*temp1*temp2*sum;
      }
    }
    for (j=1; j<=nhid; j++)  /* one weight in the input to hidden and the other 
				in the hidden to output layer or a shortcut, 
				c.f. "NNPR, 158, (4.84)" */
    { 
      j_temp = (j-1)*(nin+1)+1;
      if (hid_t == SIG)  // compute 1st derivative of HAF
	temp1 = hid(j)*(1.0-hid(j));  
      else if (hid_t == TAN)
	temp1 = (1.0-DSQR(hid(j)));
      else
	RTerror ("neuro.cc", "mat ffnet::hess (const mat& x, const mat& y,"
		 "const intset& fixed_weights, int objective_function, int regularizer,"
		 "double reg_control)", "5");
      for (k=1; k<=nout; k++)  
      { 
	k_temp = (k-1)*(nhid+1)+1;
	kc_temp = (k-1)*nin;
	h(j_temp,nhidout+k_temp) += temp1*w(nhidout+k_temp+j)*H(k);  // both bias
	for (jp=1; jp<=nhid; jp++)  // bias in the input to hidden layer, i.e., i == 0
	{
	  if (j == jp)  // Kronecker delta == 1, i.e., compute whole term of (4.84)
	    h(j_temp,nhidout+k_temp+jp) += temp1*(delta(k)+hid(jp)*w(nhidout+k_temp+j)*H(k));
	  else  // compute only second term in (4.84)
	    h(j_temp,nhidout+k_temp+jp) += temp1*hid(jp)*w(nhidout+k_temp+j)*H(k);
	}
	/* bias in the hidden to output layer, i.e., jp == 0. Kronecker delta == 0, 
	   i.e., compute only second term in (4.84) */
	for (i=1; i<=nin; i++)  
	  h(j_temp+i,nhidout+k_temp) += x(n,i)*temp1*w(nhidout+k_temp+j)*H(k);  
	for (i=1; i<=nin; i++)  // no bias
	{
	  for (jp=1; jp<=nhid; jp++)
	  {
	    if (j == jp)  // Kronecker delta == 1, i.e., compute whole term of (4.84)
	      h(j_temp+i,nhidout+k_temp+jp) += x(n,i)*temp1*(delta(k)+hid(jp)*w(nhidout+k_temp+j)*H(k));
	    else  // compute only second term in (4.84)
	      h(j_temp+i,nhidout+k_temp+jp) += x(n,i)*temp1*hid(jp)*w(nhidout+k_temp+j)*H(k);
	  }
	}
	if (shortc)  // there are shortcuts
	{
	  /* Bias in the input to hidden layer, i.e., i == 0.
	     Compute only second term in (4.84), since Kronecker delta == 0 */
	  for (jp=1; jp<=nin; jp++)  
	    h(j_temp,nshortc+kc_temp+jp) += temp1*x(n,jp)*w(nhidout+k_temp+j)*H(k);
	  for (i=1; i<=nin; i++)  // no bias
	  {
	    // Compute only second term in (4.84), since Kronecker delta == 0
	    for (jp=1; jp<=nin; jp++)
	      h(j_temp+i,nshortc+kc_temp+jp) += x(n,i)*temp1*x(n,jp)*w(nhidout+k_temp+j)*H(k);
	  }
	}
      }
    }
  }
  for (i=1; i<=w.rows(); i++)  // compute lower triangle
    for (j=i; j<=w.rows(); j++)
      h(j,i) = h(i,j);
#ifdef NUM_CHECK
  // This is mixed C/C++ code
  const double step = 1.0e-1;  // Initial stepsize, c.f. "NRC, 186-189"
  const double tol = 1.0e-5;  /* tolerance for the relative error between numerical 
				 and analytical 2nd derivative */

  double df, err;
  intset& fixed_weights_nc = const_cast <intset&> (fixed_weights);  // non-const reference
    
  ffnet_h.ynet = dvector (1, nout); ffnet_h.hid = dvector (1, nhid);  // set handler values
  ffnet_h.nin = nin; ffnet_h.nhid = nhid; ffnet_h.nout = nout; ffnet_h.nw = w.rows(); 
  ffnet_h.objective_function = objective_function; ffnet_h.regularizer = regularizer; 
  ffnet_h.hid_t = hid_t; ffnet_h.out_t = out_t; ffnet_h.shortc = shortc; 
  ffnet_h.reg_control = reg_control;
  ffnet_h.set = &fixed_weights_nc; 
  ffnet_h.x = x.elem(); ffnet_h.y = y.elem(); 
  ffnet_h.nr = x.rows();
  wscom = w.elem(); dwcom = dvector (1, ffnet_h.nw);
  for (i=1; i<=w.rows(); i++)
  {
    icom = i;
    for (j=1; j<=w.rows(); j++)
    {
      jcom = j;
      df = dfridr (num_sec_deriv, w(j), step, &err);  // compute numerical derivative
      if (fabs(h(i,j)-df)/df > tol)
      {
	cout << "...WARNING..." << endl;
	cout << "...Analytical 2nd derivative " << i << " " << j << " is ";
	cout << h(i,j) << "..." << endl;
	cout << "...Numerical 2nd derivative " << i << " " << j << " is ";
	cout << df << "..." << endl;
	cout << "...Relative error between numerical and analytical 2nd derivative is ";
	cout << fabs(h(i,j)-df)/df << "..." << endl;
	cout << "...Absolute error between numerical and analytical 2nd derivative is ";
	cout << fabs(h(i,j)-df) << "..." << endl;
	cout << "...Accuracy of numerical 2nd derivative is " << err << "..." << endl;
      }
    }
  }
  free_dvector (ffnet_h.hid, 1, nhid); free_dvector (ffnet_h.ynet, 1, nout); 
  free_dvector (dwcom, 1, ffnet_h.nw);
#endif
  if (!fixed_weights.empty())  // set 2nd derivatives of fixed weights to zero
    for (i=1; i<=w.rows(); i++)
      for (j=1; j<=w.rows(); j++)
	if ((fixed_weights.member(i)) || (fixed_weights.member(j))) h(i,j) = 0.0;
  return h;  // inefficient, 2 copies of h!
}
