
Commit 14ff1ee9 authored by Justin Bayer

Revert "Removed all traces of tonga and natural newton."

This reverts commit 6932388c.
parent 6932388c
@@ -11,3 +11,6 @@ from sbfgs import Sbfgs
from smd import Smd
from nesterov import Nesterov
from asgd import Asgd
from tonga import tonga
from nn import NaturalNewton
from nn_lowrank import NaturalNewtonLR
# -*- coding: utf-8 -*-
import numpy as np
import scipy
import scipy.linalg
import scipy.optimize
from scipy.sparse.linalg import eigsh
from base import Minimizer, is_nonzerofinite
from hf import HessianFree
from linesearch import WolfeLineSearch, BackTrack
### TODO: do not copy code from LBFGS ###
class NaturalNewton(Minimizer):
"""
Working version, but not performing better than LBFGS.
Uses a diagonal approximation of the covariance matrix of the update directions.
No search over the hyperparameter space was performed.
"""
def __init__(self, wrt, f, fprime,
initial_hessian_diag=1,
n_factors=10, line_search=None,
args=None,
countC=8, N=1,
gamma=0.995, BC=2,
logfunc=None):
super(NaturalNewton, self).__init__(wrt, args=args, logfunc=logfunc)
self.f = f
self.fprime = fprime
self.countC = countC
self.gamma = gamma
self.BC = BC
self.N = N
#for LBFGS#
self.initial_hessian_diag = initial_hessian_diag
self.n_factors = n_factors
if line_search is not None:
self.line_search = line_search
else:
self.line_search = WolfeLineSearch(wrt, self.f, self.fprime)
##################################################################
### unused functions for now, potentially helpful for a low-rank approximation ###
def min_(self, B, M):
V , U = eigsh(M, k=self.k, ncv=(2*self.k+2))
return scipy.dot(scipy.minimum(V,B)*(U.T), U)
## ret = [min(V[i], B) * scipy.outer(U[i], U[i]) for i in range(self.k)]
## return np.array(ret).sum(axis=0)
def invA (self, u, v, X, fA_m1):
Au = fA_m1(u)
return (fA_m1(X)-scipy.dot(fA_m1(v), X)/(1+scipy.dot(v,Au)) * Au)
##################################################################
### LBFGS ###
def find_direction(self, grad_diffs, steps, grad, hessian_diag, idxs):
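# Classical L-BFGS two-loop recursion: the stored (step, grad_diff) pairs
# implicitly represent an inverse-Hessian approximation, which is applied
# to `grad` without ever forming the matrix explicitly.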
grad = grad.copy() # We will change this.
n_current_factors = len(idxs)
# TODO: find a good name for this variable.
rho = scipy.empty(n_current_factors)
# TODO: vectorize this function
for i in idxs:
rho[i] = 1 / scipy.inner(grad_diffs[i], steps[i])
# TODO: find a good name for this variable as well.
alpha = scipy.empty(n_current_factors)
for i in idxs[::-1]:
alpha[i] = rho[i] * scipy.inner(steps[i], grad)
grad -= alpha[i] * grad_diffs[i]
z = hessian_diag * grad
# TODO: find a good name for this variable (surprise!)
beta = scipy.empty(n_current_factors)
for i in idxs:
beta[i] = rho[i] * scipy.inner(grad_diffs[i], z)
z += steps[i] * (alpha[i] - beta[i])
return z, {}
##################################################################
def __iter__(self):
args, kwargs = self.args.next()
grad = self.fprime(self.wrt, *args, **kwargs)
mu = grad.copy()  # copy: grad is updated in place at the end of each iteration
mu_norm = (mu**2).sum()
cov = np.zeros(mu.size)
D = np.ones(mu.size)
### LBFGS ###
grad_m1 = scipy.zeros(grad.shape)
factor_shape = self.n_factors, self.wrt.shape[0]
grad_diffs = scipy.zeros(factor_shape)
steps = scipy.zeros(factor_shape)
hessian_diag = self.initial_hessian_diag
step_length = None
idxs = []
##################################################################
for i, (next_args, next_kwargs) in enumerate(self.args):
### LBFGS ###
if i == 0:
direction = -grad
info = {}
else:
sTgd = scipy.inner(step, grad_diff)
if sTgd > 1E-10:
# Don't do an update if this value is too small.
# Determine index for the current update.
if not idxs:
# First iteration.
this_idx = 0
elif len(idxs) < self.n_factors:
# We are not "full" yet. Thus, append the next idxs.
this_idx = idxs[-1] + 1
else:
# we are full and discard the first index.
this_idx = idxs.pop(0)
idxs.append(this_idx)
grad_diffs[this_idx] = grad_diff
steps[this_idx] = step
hessian_diag = sTgd / scipy.inner(grad_diff, grad_diff)
direction, info = self.find_direction(
grad_diffs, steps, -grad, hessian_diag, idxs)
if not is_nonzerofinite(direction):
self.logfunc(
{'message': 'direction is invalid--need to bail out.'})
break
##################################################################
# Update parameters.
step = D * direction
self.wrt += step
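# TONGA-style statistics of the quasi-Newton directions, refreshed every
# countC iterations: mu is an exponential moving average of the directions,
# cov a diagonal moving estimate of their variance, and
# D_j = 1 / (1 + min(BC, cov_j / (N * ||mu||^2))) shrinks coordinates whose
# variance is large relative to the mean direction.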
if (i%self.countC == 0) and (i>0):
cov = self.gamma * cov + self.gamma*(1-self.gamma)*(direction - mu)**2
mu = self.gamma*mu + (1-self.gamma)*direction
mu_norm = (mu**2).sum()
D = 1/(1 + scipy.minimum(self.BC, cov/(self.N * mu_norm)))
#D = scipy.linalg.inv(np.eye(mu.len())+ self.min_(self.BC, cov)/(self.N*(mu**2).sum()))
## fact = self.N*(mu**2).sum()
## lamb, V = eigsh(cov, k=self.k, ncv=(2*self.k+2))
## U = scipy.minimum(lamb/fact, B)*(V.T)
## D = lambda w: w
## for j in range(self.k):
## D = lambda X: self.invA(u[j], v[j], X, D)
loss = self.f(self.wrt, *next_args, **next_kwargs)
info = {
'n_iter': i,
'loss': loss,
'direction':direction,
'gradient': grad,
'mu':mu,
'cov':cov,
'step': step,
'args': args,
'kwargs': kwargs}
yield info
# Prepare for next loop.
args, kwargs = next_args, next_kwargs
grad_m1[:], grad[:] = grad, self.fprime(self.wrt, *args, **kwargs)
grad_diff = grad - grad_m1
# -*- coding: utf-8 -*-
import numpy as np
import scipy
import math
import scipy.linalg
import scipy.optimize
from scipy.sparse.linalg import eigsh
from base import Minimizer, is_nonzerofinite
from hf import HessianFree
from linesearch import WolfeLineSearch, BackTrack
### TODO: do not copy code from LBFGS ###
class NaturalNewtonLR(Minimizer):
"""
Unstable low-rank variant of NaturalNewton.
"""
def __init__(self, wrt, f, fprime, blocksizes,
initial_hessian_diag=1,
n_factors=10, line_search=None,
args=None,
countC=8, N=1,
gamma=0.995, BC=2,
b=50, k=5,
logfunc=None):
super(NaturalNewtonLR, self).__init__(wrt, args=args, logfunc=logfunc)
self.f = f
self.fprime = fprime
self.blocksizes = blocksizes
self.countC = countC
self.gamma = gamma
self.gamma_sqrt = math.sqrt(self.gamma)
self.gamma_sqrt_m1 = math.sqrt(1-self.gamma)
self.BC = BC
self.N = N
self.b = b #TODO find a meaningful name
self.k = k
#for LBFGS#
self.initial_hessian_diag = initial_hessian_diag
self.n_factors = n_factors
if line_search is not None:
self.line_search = line_search
else:
self.line_search = WolfeLineSearch(wrt, self.f, self.fprime)
##################################################################
### LBFGS ###
def find_direction(self, grad_diffs, steps, grad, hessian_diag, idxs):
grad = grad.copy() # We will change this.
n_current_factors = len(idxs)
# TODO: find a good name for this variable.
rho = scipy.empty(n_current_factors)
# TODO: vectorize this function
for i in idxs:
rho[i] = 1 / scipy.inner(grad_diffs[i], steps[i])
# TODO: find a good name for this variable as well.
alpha = scipy.empty(n_current_factors)
for i in idxs[::-1]:
alpha[i] = rho[i] * scipy.inner(steps[i], grad)
grad -= alpha[i] * grad_diffs[i]
z = hessian_diag * grad
# TODO: find a good name for this variable (surprise!)
beta = scipy.empty(n_current_factors)
for i in idxs:
beta[i] = rho[i] * scipy.inner(grad_diffs[i], z)
z += steps[i] * (alpha[i] - beta[i])
return z, {}
##################################################################
def __iter__(self):
args, kwargs = self.args.next()
grad = self.fprime(self.wrt, *args, **kwargs)
mu = grad
mu_norm = (mu**2).sum()
G_m1 = np.zeros((mu.size, 1))
### LBFGS ###
grad_m1 = scipy.zeros(grad.shape)
factor_shape = self.n_factors, self.wrt.shape[0]
grad_diffs = scipy.zeros(factor_shape)
steps = scipy.zeros(factor_shape)
hessian_diag = self.initial_hessian_diag
step_length = None
idxs = []
##################################################################
for i, (next_args, next_kwargs) in enumerate(self.args):
### LBFGS ###
if i == 0:
direction = -grad
info = {}
else:
sTgd = scipy.inner(step, grad_diff)
if sTgd > 1E-10:
# Don't do an update if this value is too small.
# Determine index for the current update.
if not idxs:
# First iteration.
this_idx = 0
elif len(idxs) < self.n_factors:
# We are not "full" yet. Thus, append the next idxs.
this_idx = idxs[-1] + 1
else:
# we are full and discard the first index.
this_idx = idxs.pop(0)
idxs.append(this_idx)
grad_diffs[this_idx] = grad_diff
steps[this_idx] = step
hessian_diag = sTgd / scipy.inner(grad_diff, grad_diff)
direction, info = self.find_direction(
grad_diffs, steps, -grad, hessian_diag, idxs)
if not is_nonzerofinite(direction):
self.logfunc(
{'message': 'direction is invalid--need to bail out.'})
break
##################################################################
# Update parameters.
# Similar to tonga, except that the direction returned by lbfgs (i.e. the
# gradient premultiplied by the approximate inverse Hessian) is used instead
# of the raw gradient.
# mu is the running estimate of the mean of the Newton directions.
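# Every b iterations G is recompressed block-wise to its top-k
# eigen-directions; in between, one discounted column proportional to
# (direction - mu) is appended, so the rank of the factor grows by one
# per iteration.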
if (i%self.b == 0) and (i>0):
G = np.empty((mu.size, self.k))
offset = 0
#block diagonal approximation of eigenvectors
for size in self.blocksizes:
g = G_m1[offset:offset+size]
lamb, V = eigsh(scipy.dot(g, g.T), k=self.k, ncv=(2*self.k+2))
G[offset:offset+size] = np.sqrt(scipy.minimum(self.BC, lamb/(self.N * mu_norm)))*V
## lamb, V = eigsh(scipy.dot(g.T, g), k=self.k, ncv=(2*self.k+2))
## U[offset:offset+size] = np.sqrt(minimum(self.BC, lamb/(self.N * mu_norm))) * scipy.dot(g, V*np.sqrt(lamb))
offset += size
y = np.zeros(self.k)
y[-1] = 1
size = self.k
else:
size = G_m1[0].size+1
G = np.empty((mu.size, size))
G[:,:-1] = self.gamma_sqrt * G_m1[:,:]
G[:,-1] = self.gamma_sqrt * self.gamma_sqrt_m1 * (direction - mu)
y = np.zeros(size)
y[-1] = self.gamma_sqrt * self.gamma_sqrt_m1
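# Apply the covariance-corrected rescaling through the low-rank factor G:
# only a (size x size) matrix is inverted here, a Woodbury-style shortcut
# that avoids forming the full n_params x n_params matrix of the diagonal
# variant in nn.py.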
scale = self.N * mu_norm
correction = scipy.linalg.inv(np.eye(size) + scipy.dot(G.T, G) / scale)
step = scipy.dot(scipy.dot(G, correction), y - scipy.dot(G.T, mu) / scale) + mu
mu = self.gamma * mu + (1-self.gamma)*direction
mu_norm = (mu**2).sum()
self.wrt += step
loss = self.f(self.wrt, *next_args, **next_kwargs)
info = {
'n_iter': i,
'loss': loss,
'direction':direction,
'gradient': grad,
'mu':mu,
'G':G,
'step': step,
'args': args,
'kwargs': kwargs}
yield info
# Prepare for next loop.
args, kwargs = next_args, next_kwargs
grad_m1[:], grad[:] = grad, self.fprime(self.wrt, *args, **kwargs)
grad_diff = grad - grad_m1
G_m1 = G
# -*- coding: utf-8 -*-
import scipy
import numpy as np
import scipy.linalg
import scipy.optimize
import itertools
import math
from scipy.sparse.linalg import eigsh
from base import Minimizer, repeat_or_iter
class tonga(Minimizer):
"""
Best hyper-parameters for TONGA:
- number of hidden neurons : 298
- value of gamma : 0.9429783274
- damping : 2.67E-02
- nb_estimates : 203
- batch size : 2961
- cov_batch_size : 152
- variance of weights initialization : 3.80E-02
Second best hyper-parameters:
- number of hidden neurons : 259
- value of gamma : 0.9007389073
- damping : 3.70E-02
- nb_estimates : 186
- batch size : 4875
- cov_batch_size : 3
- variance of weights initialization : 6.18E-03
"""
def __init__(self, wrt, fprime, damping, blocksizes,
gamma=0.995, b=50, k=5, nb_estimates=50,
args=None, cov_args=None, logfunc=None):
super(tonga, self).__init__(
wrt, args=args, logfunc=logfunc)
self.cov_args = cov_args if cov_args is not None else self.args
self.fprime = fprime
self.damping = damping
self.blocksizes = blocksizes
self.gamma = gamma
self.gamma_sqrt = math.sqrt(self.gamma)
self.b = b #TODO find a meaningful name
#b is the number of iterations for which we let the rank of the approximate
#covariance grow before an eigendecomposition is performed again
self.k = k
#k is the number of eigenvectors we keep (rank of the approximation)
self.nb_estimates = nb_estimates
def __iter__(self):
X_m1 = np.zeros((1, self.wrt.size))
oldGrad = np.zeros((self.b -1, self.wrt.size))
step = scipy.empty(self.wrt.size)
for i, (args, kwargs) in enumerate(self.args):
offset = 0
if (i==0):
gradient_mean = self.fprime(self.wrt, *args, **kwargs)
X = scipy.empty((self.wrt.size,1))
X[:,0] = gradient_mean
elif ((i%self.b)!=0):
gradient_mean = self.fprime(self.wrt, *args, **kwargs)
X = scipy.empty((self.wrt.size, X_m1[0].size + 1))
X[:,:-1] = X_m1 * self.gamma_sqrt
X[:,-1] = gradient_mean
else:
batches = [self.cov_args.next() for _ in range(self.nb_estimates)]
gradient = [self.fprime(self.wrt, *cov_args, **cov_kwargs) for (cov_args, cov_kwargs) in batches]
gradient = np.array(gradient)
gradient_mean = gradient.mean(axis=0)
X = scipy.empty((self.wrt.size, self.k + self.b))
length = len(X[0])
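# X accumulates, per block, the k eigen-directions kept at the last
# recompression plus the exponentially discounted gradients seen since;
# x.dot(x.T) models the gradient covariance, and the damped solve below
# works with the Gram matrix x.T.dot(x) instead of forming the
# (size x size) covariance explicitly.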
for size in self.blocksizes:
if (i%self.b==0) and (i>0):
#computing gradients
grad = gradient[:, offset:offset+size]
# using old gradients to estimate the new X
factor = [self.gamma_sqrt**power for power in range(self.b-1,0,-1)]
X[offset:offset+size, self.k:self.k+self.b-1] = factor * oldGrad.T[offset:offset+size]
X[offset:offset+size, self.k+self.b-1] = self.gamma_sqrt**(self.b+i) * gradient_mean[offset:offset+size]
#eigenvalues decomposition
covariance = scipy.dot(grad.T, grad)
try:
_, X[offset:offset+size, :self.k] = eigsh(covariance, k=self.k, ncv=(2*self.k+2))
except scipy.sparse.linalg.ArpackError :
print ("ARPACK error 3: No shifts could be applied during a cycle of the Implicitly restarted Arnoldi iteration. One possibility is to increase the size of NCV relative to NEV")
fileLog = open('logError', 'a')
for j in range(covariance.shape[0]):
for l in range(covariance.shape[1]):
fileLog.write(str(covariance[j, l]))
fileLog.write('\n')
fileLog.close()
X[offset:offset+size, :self.k] = np.zeros((size, self.k))
x = X[offset:offset+size]
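# Natural-gradient step for this block: (x x^T + damping*I)^{-1} applied to
# the newest (discounted) gradient column of x, computed through the Gram
# matrix so only a (length x length) system is solved (see the commented-out
# explicit form below).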
#step[offset:offset+size] = scipy.dot(x, scipy.linalg.inv(scipy.dot(x.T,x)+ self.damping*scipy.eye(length)))[:,-1]
step[offset:offset+size] = (scipy.linalg.solve((scipy.dot(x.T,x)+ self.damping*scipy.eye(length)).T, x.T, sym_pos=True, overwrite_a=True))[-1]
offset += size
#storing the old gradients
if ((i%self.b)==0) and (i>0):
oldGrad = np.zeros((self.b -1, self.wrt.size))
else:
oldGrad[i%self.b-1] = gradient_mean
self.wrt -= step
X_m1 = X
yield {
'gradient':gradient_mean,
'args':args,
'kwargs':kwargs,
'n_iter':i,
'step':step,
}
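##################################################################
# Minimal, hypothetical usage sketch (not part of this commit). It assumes a
# parameter array `wrt`, a loss `f(wrt, X, Z)`, its gradient `fprime(wrt, X, Z)`
# and an iterator `args` yielding ((X, Z), {}) mini-batches, as the Minimizer
# base class expects; the restored optimizers are then driven like any other
# minimizer, e.g.:
#
#   opt = NaturalNewton(wrt, f, fprime, args=args)
#   for info in opt:
#       print info['n_iter'], info['loss']
#       if info['n_iter'] >= 100:
#           break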