Newton-Raphson optimizer with Ramsay (1975) acceleration
[openmx:openmx.git] / src / ComputeNR.cpp
1 /*
2  *  Copyright 2013 The OpenMx Project
3  *
4  *  Licensed under the Apache License, Version 2.0 (the "License");
5  *  you may not use this file except in compliance with the License.
6  *  You may obtain a copy of the License at
7  *
8  *       http://www.apache.org/licenses/LICENSE-2.0
9  *
10  *   Unless required by applicable law or agreed to in writing, software
11  *   distributed under the License is distributed on an "AS IS" BASIS,
12  *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  *  See the License for the specific language governing permissions and
14  *  limitations under the License.
15  */
16
17 #include <valarray>
18
19 #include "omxState.h"
20 #include "omxFitFunction.h"
21 #include "omxExportBackendState.h"
22 #include "Compute.h"
23 #include "matrix.h"
24
// Newton-Raphson optimizer. Requires a fit function that can supply
// an analytic gradient and (inverse) Hessian; per-parameter-flavor
// Ramsay (1975) accelerators damp the steps when iterates oscillate.
class ComputeNR : public omxCompute {
        typedef omxCompute super;
        omxMatrix *fitMatrix;   // fit function to minimize

        int maxIter;            // iteration limit (frontend "maxIter" slot)
        double tolerance;       // convergence threshold on largest parameter step
        int inform, iter;       // optimizer status code; iterations actually run
        int verbose;            // diagnostic output level, 0 = silent
        bool carefully;         // start as if already restarted (extra damping)
        std::vector<Ramsay1975*> ramsay;  // one accelerator per flavor; owned here
        double getMaxCaution() const;     // largest caution over all accelerators
        void clearRamsay();               // delete and drop all accelerators

public:
        ComputeNR();
        virtual ~ComputeNR();
        virtual void initFromFrontend(SEXP rObj);
        virtual omxFitFunction *getFitFunction();
        virtual void compute(FitContext *fc);
        virtual void reportResults(FitContext *fc, MxRList *out);
        virtual double getOptimizerStatus() { return inform; }  // backward compatibility
};
47
// Factory entry point used by the compute-plan dispatch machinery.
class omxCompute *newComputeNewtonRaphson()
{
        return new ComputeNR();
}
52
53 ComputeNR::ComputeNR()
54 {
55         inform = 0;
56         iter = 0;
57 }
58
59 void ComputeNR::clearRamsay()
60 {
61         for (size_t rx=0; rx < ramsay.size(); ++rx) {
62                 delete ramsay[rx];
63         }
64         ramsay.clear();
65 }
66
// The accelerators are heap-allocated in compute(); make sure they are
// freed even if compute() exited early via an error.
ComputeNR::~ComputeNR()
{
        clearRamsay();
}
71
72 void ComputeNR::initFromFrontend(SEXP rObj)
73 {
74         super::initFromFrontend(rObj);
75
76         fitMatrix = omxNewMatrixFromSlot(rObj, globalState, "fitfunction");
77         setFreeVarGroup(fitMatrix->fitFunction, varGroup);
78         omxCompleteFitFunction(fitMatrix);
79
80         if (!fitMatrix->fitFunction->hessianAvailable ||
81             !fitMatrix->fitFunction->gradientAvailable) {
82                 error("Newton-Raphson requires derivatives");
83         }
84
85         SEXP slotValue;
86         PROTECT(slotValue = GET_SLOT(rObj, install("maxIter")));
87         maxIter = INTEGER(slotValue)[0];
88
89         PROTECT(slotValue = GET_SLOT(rObj, install("tolerance")));
90         tolerance = REAL(slotValue)[0];
91         if (tolerance <= 0) error("tolerance must be positive");
92
93         PROTECT(slotValue = GET_SLOT(rObj, install("verbose")));
94         verbose = asInteger(slotValue);
95
96         PROTECT(slotValue = GET_SLOT(rObj, install("carefully")));
97         carefully = asLogical(slotValue);
98 }
99
// Expose the fit function being optimized (non-owning).
omxFitFunction *ComputeNR::getFitFunction()
{ return fitMatrix->fitFunction; }
102
// Invert a (nearly) positive-definite symmetric matrix, ridging the
// diagonal as needed.  'hess' is copied into 'ihess' and inverted in
// place (lower triangle, see InvertSymmetricPosDef).  If the factor-
// ization fails, a diagonal shift 'adj' is added and the inversion is
// retried with geometrically growing adj: starting near 2^-30 and
// increasing by a factor of 2 each retry, past 1 up to large powers of
// 2.  On return, *stress (if non-NULL) holds the shift that was needed
// (0 means the matrix inverted cleanly).  Raises an error if no shift
// within the retry budget makes the matrix invertible.
void omxApproxInvertPosDefTriangular(int dim, double *hess, double *ihess, double *stress)
{
        int info;
        int retries = 0;
        const int maxRetries = 31; // assume >=32 bit integers
        double adj = 0;
        do {
                // Fresh copy each attempt so shifts do not accumulate.
                memcpy(ihess, hess, sizeof(double) * dim * dim);

                if (retries >= 1) {
                        // th counts down from maxRetries-1; positive th gives
                        // adj = 2^-th (tiny), th <= 0 gives adj = 2^-th >= 1.
                        int th = maxRetries - retries;
                        if (th > 0) {
                                adj = 1.0/(1 << th);
                        } else {
                                adj = (1 << -th);
                        }
                        for (int px=0; px < dim; ++px) {
                                ihess[px * dim + px] += adj;
                        }
                }

                Matrix ihessMat(ihess, dim, dim);
                info = InvertSymmetricPosDef(ihessMat, 'L');
                if (info == 0) break;
        } while (++retries < maxRetries * 1.5);

        if (info > 0) {
                // info > 0: some leading minor still not positive definite.
                omxRaiseErrorf(globalState, "Hessian is not even close to positive definite (order %d)", info);
                return;
        }

        if (stress) *stress = adj;
}
136
// Invert, in place, the masked submatrix of a packed lower-triangular
// symmetric matrix.  'packedHess' holds the lower triangle of a
// dim x dim matrix in row-major packed order; 'mask' selects which
// rows/columns participate.  The selected entries are gathered into a
// dense mdim x mdim buffer, inverted via
// omxApproxInvertPosDefTriangular, and scattered back.  If a diagonal
// ridge was needed (*stress != 0) the result is considered unreliable
// and the selected entries are zeroed instead.
void omxApproxInvertPackedPosDefTriangular(int dim, int *mask, double *packedHess, double *stress)
{
        // Count the selected (unmasked-out) dimensions.
        int mdim = 0;
        for (int dx=0; dx < dim; ++dx) if (mask[dx]) mdim += 1;

        // Gather: px walks the packed triangle; m1/m2 track the compacted
        // row/column indices of selected entries.
        std::vector<double> hess(mdim * mdim, 0.0);
        for (int d1=0, px=0, m1=-1; d1 < dim; ++d1) {
                if (mask[d1]) ++m1;
                for (int d2=0, m2=-1; d2 <= d1; ++d2) {
                        if (mask[d2]) ++m2;
                        if (mask[d1] && mask[d2]) {
                                hess[m2 * mdim + m1] = packedHess[px];
                        }
                        ++px;
                }
        }

        std::vector<double> ihess(mdim * mdim);
        omxApproxInvertPosDefTriangular(mdim, hess.data(), ihess.data(), stress);

        // Scatter the inverse back into the packed storage (zeros if stressed).
        for (int d1=0, px=0, m1=-1; d1 < dim; ++d1) {
                if (mask[d1]) ++m1;
                for (int d2=0, m2=-1; d2 <= d1; ++d2) {
                        if (mask[d2]) ++m2;
                        if (mask[d1] && mask[d2]) {
                                packedHess[px] = *stress? 0 : ihess[m2 * mdim + m1];
                        }
                        ++px;
                }
        }
}
168
169 void pda(const double *ar, int rows, int cols);
170
171 double ComputeNR::getMaxCaution() const
172 {
173         double caution = 0;
174         for (size_t rx=0; rx < ramsay.size(); ++rx) {
175                 caution = std::max(caution, ramsay[rx]->maxCaution);
176         }
177         return caution;
178 }
179
// Run the damped Newton-Raphson iteration.  Each cycle asks the fit
// function for the gradient and inverse Hessian, forms the step
// ihess * grad through the sparse hgProd term list, and applies it
// with a per-flavor Ramsay (1975) caution factor.  Non-finite
// gradients or detected oscillation trigger a restart from the
// starting values with increased damping.  Convergence is declared
// when the largest parameter change falls below tolerance scaled by
// (1 - maxCaution).
void ComputeNR::compute(FitContext *fc)
{
        // complain if there are non-linear constraints TODO

        size_t numParam = varGroup->vars.size();
        if (numParam <= 0) {
                error("Model has no free parameters");
                return;
        }

        // Let the fit function tag each parameter with a "flavor";
        // parameters of the same flavor share one Ramsay accelerator.
        OMXZERO(fc->flavor, numParam);
        if (fitMatrix->fitFunction->parametersHaveFlavor) {
                omxFitFunctionCompute(fitMatrix->fitFunction, FF_COMPUTE_PARAMFLAVOR, fc);
        }

        if (1) { // add conditions to disable TODO
                // Collect the sparse (inverse Hessian) x (gradient) product
                // terms, then sort + unique to drop duplicates.
                fc->hgProd.resize(0);
                omxFitFunctionCompute(fitMatrix->fitFunction, FF_COMPUTE_HGPROD, fc);
                std::sort(fc->hgProd.begin(), fc->hgProd.end());
                std::vector<matrixVectorProdTerm>::iterator iter = std::unique(fc->hgProd.begin(), fc->hgProd.end());
                fc->hgProd.resize( std::distance(fc->hgProd.begin(), iter) );
                //fc->log("NR", FF_COMPUTE_HGPROD);
        }

        // Allocate one Ramsay accelerator per flavor in use.
        for (size_t px=0; px < numParam; ++px) {
                // global namespace for flavors? TODO
                if (fc->flavor[px] < 0 || fc->flavor[px] > 100) {  // max flavor? TODO
                        error("Invalid parameter flavor %d", fc->flavor[px]);
                }
                while (int(ramsay.size()) < fc->flavor[px]+1) {
                        const double minCaution = 0; // doesn't make sense to go faster
                        Ramsay1975 *ram = new Ramsay1975(fc, int(ramsay.size()), fc->caution,
                                                         verbose, minCaution);
                        ramsay.push_back(ram);
                }
        }

        omxFitFunctionCompute(fitMatrix->fitFunction, FF_COMPUTE_PREOPTIMIZE, fc);
        fc->maybeCopyParamToModel(globalState);

        iter = 0;
        int sinceRestart = 0;        // iterations since the last restart
        bool converged = false;
        bool approaching = false;    // 3+ clean iterations in a row
        bool restarted = carefully || fc->caution >= .5;
        double maxAdj = 0;           // largest |step| this iteration
        double maxAdjSigned = 0;     // signed value of that step
        int maxAdjFlavor = 0;
        int maxAdjParam = -1;
        double bestLL = 0;           // best fit seen so far (0 = none yet)

        std::valarray<double> startBackup(fc->est, numParam);  // for restarts
        std::vector<double> bestBackup(numParam);              // best estimates

        fitMatrix->data[0] = 0;  // may not recompute it, don't leave stale data

        if (verbose >= 2) {
                mxLog("Welcome to Newton-Raphson (tolerance %.3g, max iter %d, %ld flavors, %lu h-g terms)",
                      tolerance, maxIter, ramsay.size(), fc->hgProd.size());
        }
        while (1) {
                if (verbose >= 2) {
                        const char *pname = "none";
                        if (maxAdjParam >= 0) pname = fc->varGroup->vars[maxAdjParam]->name;
                        mxLog("Begin %d/%d iter of Newton-Raphson (prev maxAdj %.4f for %s, flavor %d)",
                              iter+1, maxIter, maxAdjSigned, pname, maxAdjFlavor);
                }

                // Only request the fit value once we have restarted; until
                // then LL tracking is not used.
                int want = FF_COMPUTE_GRADIENT|FF_COMPUTE_IHESSIAN;
                if (restarted) want |= FF_COMPUTE_FIT;

                OMXZERO(fc->grad, numParam);
                OMXZERO(fc->ihess, numParam * numParam);

                omxFitFunctionCompute(fitMatrix->fitFunction, want, fc);
                if (isErrorRaised(globalState)) break;

                if (verbose >= 5) {
                        int show = FF_COMPUTE_ESTIMATE;
                        if (verbose >= 6) show |= FF_COMPUTE_GRADIENT|FF_COMPUTE_IHESSIAN;
                        fc->log("Newton-Raphson", show);
                }

                // Track the best fit seen; assumes fit is positive
                // (presumably -2 log likelihood) so bestLL == 0 means unset.
                double LL = fitMatrix->data[0];
                if (bestLL == 0 || bestLL > LL) {
                        bestLL = LL;
                        memcpy(bestBackup.data(), fc->est, sizeof(double) * numParam);
                }
                if (bestLL > 0) {
                        if (verbose >= 3) mxLog("Newton-Raphson cantankerous fit %.5f (best %.5f)", LL, bestLL);
                        // If we were near the solution but the fit got clearly
                        // worse (or NaN), give up and keep the best estimates.
                        if (approaching && (LL > bestLL + 0.5 || !std::isfinite(LL))) {  // arbitrary threshold
                                memcpy(fc->est, bestBackup.data(), sizeof(double) * numParam);
                                fc->copyParamToModel(globalState);
                                break;
                        }
                }

                // Sanity check: the inverse Hessian must only populate the
                // upper triangle (column-major storage).
                if (verbose >= 1 || OMX_DEBUG) {
                        for (size_t h1=1; h1 < numParam; h1++) {
                                for (size_t h2=0; h2 < h1; h2++) {
                                        if (fc->ihess[h2 * numParam + h1] == 0) continue;
                                        omxRaiseErrorf(globalState, "Inverse Hessian is not upper triangular");
                                }
                        }
                }

                // Disabled experiment: shrink the gradient where the inverse
                // Hessian diagonal is suspiciously large.
                if (0) {
                        const double maxDiag = 4;
                        for (size_t dx=0; dx < numParam; dx++) {
                                double mag = fabs(fc->ihess[dx * numParam + dx]);
                                if (maxDiag < mag) {
                                        double old = fc->grad[dx];
                                        double logBad = log(1 + mag - maxDiag);
                                        fc->grad[dx] /= (1 + logBad);  // arbitrary guess
                                        mxLog("ihess bad at diag %lu grad %.8g -> %.8g", dx, old, fc->grad[dx]);
                                }
                        }
                        //fc->log("bad", FF_COMPUTE_IHESSIAN);
                }

                // Every 3rd iteration, let the accelerators recalibrate; they
                // may recommend a restart if estimates are oscillating.
                bool restart = false;
                if ((++sinceRestart) % 3 == 0) {
                        for (size_t rx=0; rx < ramsay.size(); ++rx) {
                                ramsay[rx]->recalibrate(&restart);
                        }
                }
                // A non-finite gradient also forces a restart.
                for (size_t px=0; px < numParam; ++px) {
                        if (!std::isfinite(fc->grad[px])) {
                                if (!restart) {
                                        if (verbose >= 3) {
                                                mxLog("Newton-Raphson: grad[%lu] not finite, restart recommended", px);
                                        }
                                        restart = true;
                                        break;
                                }
                        }
                }
                if ((sinceRestart % 3) == 0 && !restart) {
                        // 3 iterations without extreme oscillation or NaN gradients
                        if (!approaching && verbose >= 2) {
                                mxLog("Newton-Raphson: Probably approaching solution");
                        }
                        approaching = true;
                }

                maxAdjParam = -1;
                maxAdj = 0;
                if (restart && (!approaching || !restarted)) {
                        // Rewind to the starting values with more damping.
                        approaching = false;
                        restarted = true;
                        bestLL = 0;
                        sinceRestart = 0;

                        if (verbose >= 2) {
                                mxLog("Newton-Raphson: Increasing damping ...");
                        }
                        for (size_t px=0; px < numParam; ++px) {
                                fc->est[px] = startBackup[px];
                        }
                        for (size_t rx=0; rx < ramsay.size(); ++rx) {
                                ramsay[rx]->restart();
                        }
                } else {
                        double *grad = fc->grad;
                        double *ihess = fc->ihess;

                        // Newton step: move = ihess * grad, accumulated from
                        // the precomputed sparse product terms.
                        std::vector<double> move(numParam, 0.0);
                        for (size_t px=0; px < fc->hgProd.size(); ++px) {
                                matrixVectorProdTerm &mvp = fc->hgProd[px];
                                move[mvp.dest] += ihess[mvp.hentry] * grad[mvp.gentry];
                        }

                        // Disabled cross-check of the sparse product against
                        // a dense BLAS dsymv computation.
                        if (0) {
                                std::vector<double> blasMove(numParam); // uninit
                                const char uplo = 'U';
                                int dim = int(numParam);
                                int incx = 1;
                                double alpha = 1;
                                double beta = 0;
                                F77_CALL(dsymv)(&uplo, &dim, &alpha, fc->ihess, &dim,
                                                fc->grad, &incx, &beta, blasMove.data(), &incx);

                                double blasDiff = 0;
                                for (size_t px=0; px < numParam; ++px) {
                                        blasDiff += fabs(move[px] - blasMove[px]);
                                }
                                mxLog("blasdiff %.10g", blasDiff);
                                if (blasDiff > 1) {
                                        fc->log("N-R", FF_COMPUTE_GRADIENT|FF_COMPUTE_IHESSIAN|FF_COMPUTE_HGPROD);
                                        pda(move.data(), 1, numParam);
                                        pda(blasMove.data(), 1, numParam);
                                        abort();
                                }
                        }

                        // Apply the damped step; recordEstimate may adjust the
                        // proposal, so measure the actual change afterwards.
                        for (size_t px=0; px < numParam; ++px) {
                                Ramsay1975 *ramsay1 = ramsay[ fc->flavor[px] ];
                                double oldEst = fc->est[px];
                                double speed = 1 - ramsay1->caution;

                                ramsay1->recordEstimate(px, oldEst - speed * move[px]);

                                double badj = fabs(oldEst - fc->est[px]);
                                if (maxAdj < badj) {
                                        maxAdj = badj;
                                        maxAdjSigned = oldEst - fc->est[px];
                                        maxAdjParam = px;
                                        maxAdjFlavor = fc->flavor[px];
                                }
                        }
                        // Scale tolerance down when heavily damped, since
                        // damping alone shrinks the steps.
                        converged = maxAdj < tolerance * (1-getMaxCaution());
                        //converged = maxAdj < tolerance;  // makes practically no difference
                }

                fc->copyParamToModel(globalState);

                R_CheckUserInterrupt();

                if (converged || ++iter >= maxIter) break;
        }

        // Preserve the damping level for any subsequent optimizer pass.
        fc->caution = getMaxCaution();
        clearRamsay();

        // Translate how the loop ended into an inform code.
        if (converged) {
                fc->inform = INFORM_CONVERGED_OPTIMUM;
                if (verbose >= 1) {
                        mxLog("Newton-Raphson converged in %d cycles (max change %.12f, max caution %.4f)",
                              iter, maxAdj, fc->caution);
                }
        } else if (iter < maxIter) {
                // Left the loop early (fit worsening near the solution).
                fc->inform = INFORM_UNCONVERGED_OPTIMUM;
                if (verbose >= 1) {
                        mxLog("Newton-Raphson not improving on %.6f after %d cycles (max caution %.4f)",
                              bestLL, iter, fc->caution);
                }
        } else if (iter == maxIter) {
                if (bestLL > 0) {
                        // Ran out of iterations but did improve; keep the best
                        // estimates seen.
                        fc->inform = INFORM_NOT_AT_OPTIMUM;
                        memcpy(fc->est, bestBackup.data(), sizeof(double) * numParam);
                        fc->copyParamToModel(globalState);
                        if (verbose >= 1) {
                                mxLog("Newton-Raphson improved but fail to converge after %d cycles (max caution %.4f)",
                                      iter, fc->caution);
                        }
                } else {
                        fc->inform = INFORM_ITERATION_LIMIT;
                        if (verbose >= 1) {
                                mxLog("Newton-Raphson failed to converge after %d cycles (max caution %.4f)",
                                      iter, fc->caution);
                        }
                }
        }

        fc->iterations = iter;
}
436
// Package the optimization results for return to the R frontend:
// final fit value, parameter estimates, and iteration count.
// PROTECT balance is handled at the .Call boundary (file convention).
void ComputeNR::reportResults(FitContext *fc, MxRList *out)
{
        if (Global->numIntervals) {
                warning("Confidence intervals are not implemented for Newton-Raphson");
        }  

        omxPopulateFitFunction(fitMatrix, out);

        size_t numFree = varGroup->vars.size();

        SEXP estimate;
        PROTECT(estimate = allocVector(REALSXP, numFree));
        memcpy(REAL(estimate), fc->est, sizeof(double) * numFree);

        // "minimum" and "Minus2LogLikelihood" are both reported for
        // backward compatibility; they carry the same fit value.
        out->push_back(std::make_pair(mkChar("minimum"), ScalarReal(fc->fit)));
        out->push_back(std::make_pair(mkChar("Minus2LogLikelihood"), ScalarReal(fc->fit)));
        out->push_back(std::make_pair(mkChar("estimate"), estimate));

        SEXP iterations;

        // SEXP code;
        // PROTECT(code = NEW_NUMERIC(1));
        // REAL(code)[0] = inform;
        // out->push_back(std::make_pair(mkChar("nr.code"), code));

        PROTECT(iterations = NEW_NUMERIC(1));
        REAL(iterations)[0] = iter;
        out->push_back(std::make_pair(mkChar("nr.iterations"), iterations));
}