Fixed issue with SVD on some objects.

Updated the LevMar library to V2.4. Pushed version number to 2.9.7.

Fixed issue with SVD on some objects.
Updated the LevMar library to V2.4. Pushed version number to 2.9.7.
534cebef · Emmanuel Bertin · 882ad557 · 534cebef · 534cebef · 534cebef
Commit 534cebef authored Oct 22, 2009 by Emmanuel Bertin
--- a/configure
+++ b/configure
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.63 for sextractor 2.9.6.
+# Generated by GNU Autoconf 2.63 for sextractor 2.9.7.
 #
 # Report bugs to <bertin@iap.fr>.
 #
@@ -750,8 +750,8 @@ SHELL=${CONFIG_SHELL-/bin/sh}
 # Identity of this package.
 PACKAGE_NAME='sextractor'
 PACKAGE_TARNAME='sextractor'
-PACKAGE_VERSION='2.9.6'
-PACKAGE_STRING='sextractor 2.9.6'
+PACKAGE_VERSION='2.9.7'
+PACKAGE_STRING='sextractor 2.9.7'
 PACKAGE_BUGREPORT='bertin@iap.fr'
  
 ac_unique_file="src/makeit.c"
@@ -1505,7 +1505,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-\`configure' configures sextractor 2.9.6 to adapt to many kinds of systems.
+\`configure' configures sextractor 2.9.7 to adapt to many kinds of systems.
  
 Usage: $0 [OPTION]... [VAR=VALUE]...
  
@@ -1575,7 +1575,7 @@ fi
  
 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of sextractor 2.9.6:";;
+     short | recursive ) echo "Configuration of sextractor 2.9.7:";;
   esac
  cat <<\_ACEOF
  
@@ -1706,7 +1706,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-sextractor configure 2.9.6
+sextractor configure 2.9.7
 generated by GNU Autoconf 2.63
  
 Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
@@ -1720,7 +1720,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
  
-It was created by sextractor $as_me 2.9.6, which was
+It was created by sextractor $as_me 2.9.7, which was
 generated by GNU Autoconf 2.63.  Invocation command line was
  
  $ $0 $@
@@ -2423,7 +2423,7 @@ fi
  
 # Define the identity of the package.
 PACKAGE='sextractor'
- VERSION='2.9.6'
+ VERSION='2.9.7'
  
  
 cat >>confdefs.h <<_ACEOF
@@ -28291,7 +28291,7 @@ exec 6>&1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by sextractor $as_me 2.9.6, which was
+This file was extended by sextractor $as_me 2.9.7, which was
 generated by GNU Autoconf 2.63.  Invocation command line was
  
  CONFIG_FILES    = $CONFIG_FILES
@@ -28354,7 +28354,7 @@ Report bugs to <bug-autoconf@gnu.org>."
 _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_version="\\
-sextractor config.status 2.9.6
+sextractor config.status 2.9.7
 configured by $0, generated by GNU Autoconf 2.63,
  with options \\"`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`\\"
  

--- a/configure.ac
+++ b/configure.ac
@@ -6,7 +6,7 @@ define([AC_CACHE_LOAD],)
 define([AC_CACHE_SAVE],)

 # This is your standard Bertin source code...
-AC_INIT(sextractor, 2.9.6, [bertin@iap.fr])
+AC_INIT(sextractor, 2.9.7, [bertin@iap.fr])
 AC_CONFIG_SRCDIR(src/makeit.c)
 AC_CONFIG_AUX_DIR(autoconf)
 AM_CONFIG_HEADER(config.h)

--- a/man/sex.1
+++ b/man/sex.1
-.TH SEXTRACTOR "1" "October 2009" "SWarp 2.9.6" "User Commands"
+.TH SEXTRACTOR "1" "October 2009" "SWarp 2.9.7" "User Commands"
 .SH NAME
 sex \- extract a source catalog from an astronomical FITS image
 .SH SYNOPSIS

--- a/src/levmar/Axb_core.c
+++ b/src/levmar/Axb_core.c
@@ -36,15 +36,17 @@

 /* prototypes of LAPACK routines */

-#define GEQRF LM_ADD_PREFIX(geqrf_)
-#define ORGQR LM_ADD_PREFIX(orgqr_)
-#define TRTRS LM_ADD_PREFIX(trtrs_)
-#define POTF2 LM_ADD_PREFIX(potf2_)
-#define POTRF LM_ADD_PREFIX(potrf_)
-#define GETRF LM_ADD_PREFIX(getrf_)
-#define GETRS LM_ADD_PREFIX(getrs_)
-#define GESVD LM_ADD_PREFIX(gesvd_)
-#define GESDD LM_ADD_PREFIX(gesdd_)
+
+#define GEQRF LM_MK_LAPACK_NAME(geqrf)
+#define ORGQR LM_MK_LAPACK_NAME(orgqr)
+#define TRTRS LM_MK_LAPACK_NAME(trtrs)
+#define POTF2 LM_MK_LAPACK_NAME(potf2)
+#define POTRF LM_MK_LAPACK_NAME(potrf)
+#define POTRS LM_MK_LAPACK_NAME(potrs)
+#define GETRF LM_MK_LAPACK_NAME(getrf)
+#define GETRS LM_MK_LAPACK_NAME(getrs)
+#define GESVD LM_MK_LAPACK_NAME(gesvd)
+#define GESDD LM_MK_LAPACK_NAME(gesdd)

 /* QR decomposition */
 extern int GEQRF(int *m, int *n, LM_REAL *a, int *lda, LM_REAL *tau, LM_REAL *work, int *lwork, int *info);
@@ -53,9 +55,10 @@ extern int ORGQR(int *m, int *n, int *k, LM_REAL *a, int *lda, LM_REAL *tau, LM_
 /* solution of triangular systems */
 extern int TRTRS(char *uplo, char *trans, char *diag, int *n, int *nrhs, LM_REAL *a, int *lda, LM_REAL *b, int *ldb, int *info);

-/* cholesky decomposition */
+/* Cholesky decomposition and systems solution */
 extern int POTF2(char *uplo, int *n, LM_REAL *a, int *lda, int *info);
 extern int POTRF(char *uplo, int *n, LM_REAL *a, int *lda, int *info); /* block version of dpotf2 */
+extern int POTRS(char *uplo, int *n, int *nrhs, LM_REAL *a, int *lda, LM_REAL *b, int *ldb, int *info);

 /* LU decomposition and systems solution */
 extern int GETRF(int *m, int *n, LM_REAL *a, int *lda, int *ipiv, int *info);
@@ -88,7 +91,7 @@ extern int GESDD(char *jobz, int *m, int *n, LM_REAL *a, int *lda, LM_REAL *s, L
 *
 * A is mxm, b is mx1
 *
- * The function returns 0 in case of error, 1 if successfull
+ * The function returns 0 in case of error, 1 if successful
 *
 * This function is often called repetitively to solve problems of identical
 * dimensions. To avoid repetitive malloc's and free's, allocated memory is
@@ -100,6 +103,8 @@ int AX_EQ_B_QR(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m)
 __STATIC__ LM_REAL *buf=NULL;
 __STATIC__ int buf_sz=0;

+static int nb=0; /* no __STATIC__ decl. here! */
+
 LM_REAL *a, *qtb, *tau, *r, *work;
 int a_sz, qtb_sz, tau_sz, r_sz, tot_sz;
 register int i, j;
@@ -123,7 +128,15 @@ register LM_REAL sum;
    qtb_sz=m;
    tau_sz=m;
    r_sz=m*m; /* only the upper triangular part really needed */
-    worksz=3*m; /* this is probably too much */
+
+if(!nb){
+    LM_REAL tmp;
+
+    worksz=-1; // workspace query; optimal size is returned
+    GEQRF((int *)&m, (int *)&m, NULL, (int *)&m, NULL, (LM_
+    nb=((int)tmp)/m; // optimal worksize is m*nb
+}
+    worksz=nb*m;
    tot_sz=a_sz + qtb_sz + tau_sz + r_sz + worksz;

 #ifdef LINSOLVERS_RETAIN_MEMORY
@@ -244,7 +257,7 @@ register LM_REAL sum;
 *
 * A is mxn, b is mx1
 *
- * The function returns 0 in case of error, 1 if successfull
+ * The function returns 0 in case of error, 1 if successful
 *
 * This function is often called repetitively to solve problems of identical
 * dimensions. To avoid repetitive malloc's and free's, allocated memory is
@@ -256,6 +269,8 @@ int AX_EQ_B_QRLS(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m, int n)
 __STATIC__ LM_REAL *buf=NULL;
 __STATIC__ int buf_sz=0;

+static int nb=0; /* no __STATIC__ decl. here! */
+
 LM_REAL *a, *atb, *tau, *r, *work;
 int a_sz, atb_sz, tau_sz, r_sz, tot_sz;
 register int i, j;
@@ -284,7 +299,14 @@ register LM_REAL sum;
    atb_sz=n;
    tau_sz=n;
    r_sz=n*n;
-    worksz=3*n; /* this is probably too much */
+    if(!nb){
+      LM_REAL tmp;
+
+      worksz=-1; // workspace query; optimal size is returned
+      GEQRF((int *)&m, (int *)&m, NULL, (int *)&m, NULL, (LM_
+      nb=((int)tmp)/m; // optimal worksize is m*nb
+    }
+    worksz=nb*m;
    tot_sz=a_sz + atb_sz + tau_sz + r_sz + worksz;

 #ifdef LINSOLVERS_RETAIN_MEMORY
@@ -411,7 +433,7 @@ register LM_REAL sum;
 *
 * A is mxm, b is mx1
 *
- * The function returns 0 in case of error, 1 if successfull
+ * The function returns 0 in case of error, 1 if successful
 *
 * This function is often called repetitively to solve problems of identical
 * dimensions. To avoid repetitive malloc's and free's, allocated memory is
@@ -425,7 +447,7 @@ __STATIC__ int buf_sz=0;

 LM_REAL *a, *b;
 int a_sz, b_sz, tot_sz;
-register int i, j;
+register int i;
 int info, nrhs=1;

    if(!A)
@@ -468,24 +490,29 @@ int info, nrhs=1;
    a=buf;
    b=a+a_sz;

-  /* store A (column major!) into a anb B into b */
-	for(i=0; i<m; i++){
-		for(j=0; j<m; j++)
-			a[i+j*m]=A[i*m+j];
-
-    b[i]=B[i];
+  /* store A into a anb B into b. A is assumed symmetric,
+   * hence no transposition is needed
+   */
+  for(i=0; i<m; i++){
+     a[i]=A[i];
+     b[i]=B[i];
  }
+  for(i=m; i<m*m; i++)
+    a[i]=A[i];

  /* Cholesky decomposition of A */
-  POTF2("U", (int *)&m, a, (int *)&m, (int *)&info);
+  //POTF2("U", (int *)&m, a, (int *)&m, (int *)&info);
+  POTRF("U", (int *)&m, a, (int *)&m, (int *)&info);
  /* error treatment */
  if(info!=0){
    if(info<0){
-      fprintf(stderr, RCAT(RCAT("LAPACK error: illegal value for argument %d of ", POTF2) " in ", AX_EQ_B_CHOL) "()\n", -info);
+      fprintf(stderr, RCAT(RCAT(RCAT("LAPACK error: illegal value for argument %d of ", POTF2) "/", POTRF) " in ",
+                      AX_EQ_B_CHOL) "()\n", -info);
      exit(1);
    }
    else{
-      fprintf(stderr, RCAT(RCAT("LAPACK error: the leading minor of order %d is not positive definite,\nthe factorization could not be completed for ", POTF2) " in ", AX_EQ_B_CHOL) "()\n", info);
+      fprintf(stderr, RCAT(RCAT(RCAT("LAPACK error: the leading minor of order %d is not positive definite,\nthe factorization could not be completed for ", POTF2) "/", POTRF), " in ",
+                      AX_EQ_B_CHOL) "()\n", info);
 #ifndef LINSOLVERS_RETAIN_MEMORY
      free(buf);
 #endif
@@ -494,7 +521,15 @@ int info, nrhs=1;
    }
  }

-  /* solve the linear system U^T y = b */
+  /* solve using the computed Cholesky in one lapack call */
+  POTRS("U", (int *)&m, (int *)&nrhs, a, (int *)&m, b, (int *)&m, &info);
+  if(info<0){
+    fprintf(stderr, RCAT(RCAT("LAPACK error: illegal value for argument %d of ", POTRS) " in ", AX_EQ_B_CHOL) "()\n", -info);
+    exit(1);
+  }
+
+#if 0
+  /* alternative: solve the linear system U^T y = b ... */
  TRTRS("U", "T", "N", (int *)&m, (int *)&nrhs, a, (int *)&m, b, (int *)&m, &info);
  /* error treatment */
  if(info!=0){
@@ -512,7 +547,7 @@ int info, nrhs=1;
    }
  }

-  /* solve the linear system U x = y */
+  /* ... solve the linear system U x = y */
  TRTRS("U", "N", "N", (int *)&m, (int *)&nrhs, a, (int *)&m, b, (int *)&m, &info);
  /* error treatment */
  if(info!=0){
@@ -529,6 +564,7 @@ int info, nrhs=1;
      return 0;
    }
  }
+#endif /* 0 */

 	/* copy the result in x */
 	for(i=0; i<m; i++)
@@ -551,8 +587,7 @@ int info, nrhs=1;
 *
 * A is mxm, b is mx1
 *
- * The function returns 0 in case of error,
- * 1 if successfull
+ * The function returns 0 in case of error, 1 if successful
 *
 * This function is often called repetitively to solve problems of identical
 * dimensions. To avoid repetitive malloc's and free's, allocated memory is
@@ -564,10 +599,10 @@ int AX_EQ_B_LU(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m)
 __STATIC__ LM_REAL *buf=NULL;
 __STATIC__ int buf_sz=0;

-int a_sz, ipiv_sz, b_sz, work_sz, tot_sz;
+int a_sz, ipiv_sz, b_sz, tot_sz;
 register int i, j;
 int info, *ipiv, nrhs=1;
-LM_REAL *a, *b, *work;
+LM_REAL *a, *b;

    if(!A)
 #ifdef LINSOLVERS_RETAIN_MEMORY
@@ -585,15 +620,14 @@ LM_REAL *a, *b, *work;
    ipiv_sz=m;
    a_sz=m*m;
    b_sz=m;
-    work_sz=100*m; /* this is probably too much */
-    tot_sz=ipiv_sz + a_sz + b_sz + work_sz; // ipiv_sz counted as LM_REAL here, no harm is done though
+    tot_sz=(a_sz + b_sz)*sizeof(LM_REAL) + ipiv_sz*sizeof(int); /* should be arranged in that order for proper doubles alignment */

 #ifdef LINSOLVERS_RETAIN_MEMORY
    if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */
      if(buf) free(buf); /* free previously allocated memory */

      buf_sz=tot_sz;
-      buf=(LM_REAL *)malloc(buf_sz*sizeof(LM_REAL));
+      buf=(LM_REAL *)malloc(buf_sz);
      if(!buf){
        fprintf(stderr, RCAT("memory allocation in ", AX_EQ_B_LU) "() failed!\n");
        exit(1);
@@ -601,17 +635,16 @@ LM_REAL *a, *b, *work;
    }
 #else
      buf_sz=tot_sz;
-      buf=(LM_REAL *)malloc(buf_sz*sizeof(LM_REAL));
+      buf=(LM_REAL *)malloc(buf_sz);
      if(!buf){
        fprintf(stderr, RCAT("memory allocation in ", AX_EQ_B_LU) "() failed!\n");
        exit(1);
      }
 #endif /* LINSOLVERS_RETAIN_MEMORY */

-    ipiv=(int *)buf;
-    a=(LM_REAL *)(ipiv + ipiv_sz);
+    a=buf;
    b=a+a_sz;
-    work=b+b_sz;
+    ipiv=(int *)(b+b_sz);

    /* store A (column major!) into a and B into b */
 	  for(i=0; i<m; i++){
@@ -677,7 +710,7 @@ LM_REAL *a, *b, *work;
 *
 * A is mxm, b is mx1.
 *
- * The function returns 0 in case of error, 1 if successfull
+ * The function returns 0 in case of error, 1 if successful
 *
 * This function is often called repetitively to solve problems of identical
 * dimensions. To avoid repetitive malloc's and free's, allocated memory is
@@ -710,12 +743,21 @@ int info, rank, worksz, *iwork, iworksz;
 #endif /* LINSOLVERS_RETAIN_MEMORY */
   
  /* calculate required memory size */
-  worksz=16*m; /* more than needed */
+#if 1 /* use optimal size */
+  worksz=-1; // workspace query. Keep in mind that GESDD requires more memory than GESVD
+  /* note that optimal work size is returned in thresh */
+  GESVD("A", "A", (int *)&m, (int *)&m, NULL, (int *)&m, NULL, NULL, (int *)&m, NULL, (int *)&m, (LM_REAL *)&thresh, (int *)&worksz, &info);
+  //GESDD("A", (int *)&m, (int *)&m, NULL, (int *)&m, NULL, NULL, (int *)&m, NULL, (int *)&m, (LM_REAL *)&thresh, (int *)&worksz, NULL, &info);
+  worksz=(int)thresh;
+#else /* use minimum size */
+  worksz=5*m; // min worksize for GESVD
+  //worksz=m*(7*m+4); // min worksize for GESDD
+#endif
  iworksz=8*m;
  a_sz=m*m;
  u_sz=m*m; s_sz=m; vt_sz=m*m;

-  tot_sz=iworksz*sizeof(int) + (a_sz + u_sz + s_sz + vt_sz + worksz)*sizeof(LM_REAL);
+  tot_sz=(a_sz + u_sz + s_sz + vt_sz + worksz)*sizeof(LM_REAL) + iworksz*sizeof(int); /* should be arranged in that order for proper doubles alignment */

 #ifdef LINSOLVERS_RETAIN_MEMORY
  if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */
@@ -737,18 +779,18 @@ int info, rank, worksz, *iwork, iworksz;
    }
 #endif /* LINSOLVERS_RETAIN_MEMORY */

-  iwork=(int *)buf;
-  a=(LM_REAL *)(iwork+iworksz);
+  a=buf;
+  u=a+a_sz;
+  s=u+u_sz;
+  vt=s+s_sz;
+  work=vt+vt_sz;
+  iwork=(int *)(work+worksz);
+
  /* store A (column major!) into a */
  for(i=0; i<m; i++)
    for(j=0; j<m; j++)
      a[i+j*m]=A[i*m+j];

-  u=a + a_sz;
-  s=u+u_sz;
-  vt=s+s_sz;
-  work=vt+vt_sz;
-
  /* SVD decomposition of A */
  GESVD("A", "A", (int *)&m, (int *)&m, a, (int *)&m, s, u, (int *)&m, vt, (int *)&m, work, (int *)&worksz, &info);
  //GESDD("A", (int *)&m, (int *)&m, a, (int *)&m, s, u, (int *)&m, vt, (int *)&m, work, (int *)&worksz, iwork, &info);
@@ -845,8 +887,7 @@ extern int GETRS(char *trans, int *n, int *nrhs, LM_REAL *a, int *lda, int *ipiv
 *
 * A is mxm, b is mx1
 *
- * The function returns 0 in case of error,
- * 1 if successfull
+ * The function returns 0 in case of error, 1 if successful
 *
 * This function is often called repetitively to solve problems of identical
 * dimensions. To avoid repetitive malloc's and free's, allocated memory is
@@ -862,7 +903,7 @@ __STATIC__ int buf_sz=0;
 int a_sz, ipiv_sz, b_sz, work_sz, tot_sz;
 register int i, j;
 int info, *ipiv;
-LM_REAL *a, *b, *work;
+LM_REAL *a, *b;
    if(!A)
 #ifdef LINSOLVERS_RETAIN_MEMORY
    {
@@ -879,15 +920,14 @@ LM_REAL *a, *b, *work;
    ipiv_sz=m;
    a_sz=m*m;
    b_sz=m;
-    work_sz=100*m; /* this is probably too much */
-    tot_sz=ipiv_sz + a_sz + b_sz + work_sz; // ipiv_sz counted as LM_REAL here, no harm is done though
+    tot_sz=(ipiv_sz + a_sz + b_sz)*sizeof(LM_REAL);

 #ifdef LINSOLVERS_RETAIN_MEMORY
    if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */
      if(buf) free(buf); /* free previously allocated memory */

      buf_sz=tot_sz;
-      buf=(LM_REAL *)malloc(buf_sz*sizeof(LM_REAL));
+      buf=(void *)malloc(buf_sz);
      if(!buf){
        fprintf(stderr, RCAT("memory allocation in ", AX_EQ_B_LU) "() failed!\n");
        exit(1);
@@ -895,7 +935,7 @@ LM_REAL *a, *b, *work;
    }
 #else
      buf_sz=tot_sz;
-      buf=(LM_REAL *)malloc(buf_sz*sizeof(LM_REAL));
+      buf=(void *))malloc(buf_sz);
      if(!buf){
        fprintf(stderr, RCAT("memory allocation in ", AX_EQ_B_LU) "() failed!\n");
        exit(1);
@@ -905,7 +945,6 @@ LM_REAL *a, *b, *work;
    ipiv=(int *)buf;
    a=(LM_REAL *)(ipiv + ipiv_sz);
    b=a+a_sz;
-    work=b+b_sz;

    /* store A (column major!) into a and B into b */
 	  for(i=0; i<m; i++){

--- a/src/levmar/README
+++ b/src/levmar/README
-The levmar v2.3 library has been included in this package untouched, except for
+The levmar v2.4 library has been included in this package untouched, except for
 three warnings removed, LU decomposition replaced with calls to ATLAS-Lapack
 routines, Hessian matrix inversion done with SVD, and an AutoMakefile added.
 			Emmanuel Bertin <bertin@iap.fr>
--- a/src/levmar/README.txt
+++ b/src/levmar/README.txt
    **************************************************************
                                LEVMAR
-                              version 2.3
+                              version 2.4
                          By Manolis Lourakis

                     Institute of Computer Science
@@ -28,7 +28,7 @@ It is strongly recommended that you *do* employ LAPACK; if you don't have it alr
 I suggest getting clapack from http://www.netlib.org/clapack. However, LAPACK's
 use is not mandatory and the 2nd option makes levmar totally self-contained.
 See lmdemo.c for examples of use and http://www.ics.forth.gr/~lourakis/levmar
-for general comments.
+for general comments. An example of using levmar for data fitting is in expfit.c

 The mathematical theory behind levmar is described in the lecture notes entitled
 "Methods for Non-Linear Least Squares Problems", by K. Madsen, H.B. Nielsen and O. Tingleff,
@@ -52,6 +52,10 @@ COMPILATION
 - Under Windows and if Visual C is installed & configured for command line
   use, type "nmake /f Makefile.vc" in a cmd window to build levmar and the
   demo program. In case of trouble, read the comments on top of Makefile.vc
+   Visual C++ project files (levmar.vcproj and lmdemo.vcproj) are also included,
+   however they are not supported and are only meant to serve as a starting point
+   for creating your own. Check http://www.arstdesign.com/articles/prjconverter.html
+   if you need to convert to .dsw/.dsp (i.e., Visual C++ 6.0) project files.

 - levmar can also be built under various platforms using the CMake cross-platform
   build system. The included CMakeLists.txt file can be used to generate makefiles

--- a/src/levmar/lm.h
+++ b/src/levmar/lm.h
@@ -27,7 +27,7 @@

 /* specify whether to use LAPACK or not. The first option is strongly recommended */
 #define HAVE_LAPACK /* use LAPACK */
-#undef HAVE_LAPACK  /* uncomment this to force not using LAPACK */
+#undef HAVE_LAPACK */  /* uncomment this to force not using LAPACK */

 /* to avoid the overhead of repeated mallocs(), routines in Axb.c can be instructed to
 * retain working memory between calls. Such a choice, however, renders these routines
@@ -79,12 +79,12 @@ extern "C" {
 #define LM_BLEC_DIF_WORKSZ(npar, nmeas, nconstr) LM_LEC_DIF_WORKSZ((npar), (nmeas)+(npar), (nconstr))

 #define LM_OPTS_SZ    	 5 /* max(4, 5) */
-#define LM_INFO_SZ    	 9
+#define LM_INFO_SZ    	 10
 #define LM_ERROR         -1
 #define LM_INIT_MU    	 1E-03
 #define LM_STOP_THRESH	 1E-17
 #define LM_DIFF_DELTA    1E-06
-#define LM_VERSION       "2.3 (May 2008)"
+#define LM_VERSION       "2.4 (April 2009)"

 #ifdef LM_DBL_PREC
 /* double precision LM, with & without Jacobian */
@@ -241,15 +241,18 @@ extern void slevmar_chkjac(
    float *p, int m, int n, void *adata, float *err);
 #endif /* LM_SNGL_PREC */

-/* standard deviation & Pearson's correlation coefficient for best-fit parameters */
+/* standard deviation, coefficient of determination (R2) & Pearson's correlation coefficient for best-fit parameters */
 #ifdef LM_DBL_PREC
 extern double dlevmar_stddev( double *covar, int m, int i);
 extern double dlevmar_corcoef(double *covar, int m, int i, int j);
+extern double dlevmar_R2(void (*func)(double *p, double *hx, int m, int n, void *adata), double *p, double *x, int m, int n, void *adata);
+
 #endif /* LM_DBL_PREC */

 #ifdef LM_SNGL_PREC
 extern float slevmar_stddev( float *covar, int m, int i);
 extern float slevmar_corcoef(float *covar, int m, int i, int j);
+extern float slevmar_R2(void (*func)(float *p, float *hx, int m, int n, void *adata), float *p, float *x, int m, int n, void *adata);
 #endif /* LM_SNGL_PREC */

 #ifdef __cplusplus

--- a/src/levmar/lm_core.c
+++ b/src/levmar/lm_core.c
@@ -50,7 +50,7 @@
 * This function requires an analytic Jacobian. In case the latter is unavailable,
 * use LEVMAR_DIF() bellow
 *
- * Returns the number of iterations (>=0) if successfull, LM_ERROR if failed
+ * Returns the number of iterations (>=0) if successful, LM_ERROR if failed
 *
 * For more details, see K. Madsen, H.B. Nielsen and O. Tingleff's lecture notes on 
 * non-linear least squares at http://www.imm.dtu.dk/pubdb/views/edoc_download.php/3215/pdf/imm3215.pdf
@@ -81,6 +81,7 @@ int LEVMAR_DER(
                      *                                 7 - stopped by invalid (i.e. NaN or Inf) "func" values. This is a user error
                      * info[7]= # function evaluations
                      * info[8]= # Jacobian evaluations
+                      * info[9]= # linear systems solved, i.e. # attempts for reducing error
                      */
  LM_REAL *work,     /* working memory at least LM_DER_WORKSZ() reals large, allocated if NULL */
  LM_REAL *covar,    /* O: Covariance matrix corresponding to LS solution; mxm. Set to NULL if not needed. */
@@ -106,7 +107,7 @@ LM_REAL p_eL2, jacTe_inf, pDp_eL2; /* ||e(p)||_2, ||J^T e||_inf, ||e(p+Dp)||_2 *
 LM_REAL p_L2, Dp_L2=LM_REAL_MAX, dF, dL;
 LM_REAL tau, eps1, eps2, eps2_sq, eps3;
 LM_REAL init_p_eL2;
-int nu=2, nu2, stop=0, nfev, njev=0;
+int nu=2, nu2, stop=0, nfev, njev=0, nlss=0;
 const int nm=n*m;
 int (*linsolver)(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m)=NULL;

@@ -143,7 +144,7 @@ int (*linsolver)(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m)=NULL;
    work=(LM_REAL *)malloc(worksz*sizeof(LM_REAL)); /* allocate a big chunk in one step */
    if(!work){
      fprintf(stderr, LCAT(LEVMAR_DER, "(): memory allocation request failed\n"));
-      exit(1);
+      return LM_ERROR;
    }
    freework=1;
  }
@@ -181,7 +182,7 @@ int (*linsolver)(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m)=NULL;
    }

    /* Compute the Jacobian J at p,  J^T J,  J^T e,  ||J^T e||_inf and ||p||^2.
-     * Since J^T J is symmetric, its computation can be speeded up by computing
+     * Since J^T J is symmetric, its computation can be sped up by computing
     * only its upper triangular part and copying it to the lower part
     */

@@ -189,38 +190,51 @@ int (*linsolver)(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m)=NULL;

    /* J^T J, J^T e */
    if(nm<__BLOCKSZ__SQ){ // this is a small problem
-      /* This is the straightforward way to compute J^T J, J^T e. However, due to
-       * its noncontinuous memory access pattern, it incures many cache misses when
-       * applied to large minimization problems (i.e. problems involving a large
-       * number of free variables and measurements), in which J is too large to
-       * fit in the L1 cache. For such problems, a cache-efficient blocking scheme
-       * is preferable.
+      /* J^T*J_ij = \sum_l J^T_il * J_lj = \sum_l J_li * J_lj.
+       * Thus, the product J^T J can be computed using an outer loop for
+       * l that adds J_li*J_lj to each element ij of the result. Note that
+       * with this scheme, the accesses to J and JtJ are always along rows,
+       * therefore induces less cache misses compared to the straightforward
+       * algorithm for computing the product (i.e., l loop is innermost one).
+       * A similar scheme applies to the computation of J^T e.
+       * However, for large minimization problems (i.e., involving a large number
+       * of unknowns and measurements) for which J/J^T J rows are too large to
+       * fit in the L1 cache, even this scheme incures many cache misses. In
+       * such cases, a cache-efficient blocking scheme is preferable.
       *
       * Thanks to John Nitao of Lawrence Livermore Lab for pointing out this
       * performance problem.
       *
-       * On the other hand, the straightforward algorithm is faster on small
+       * Note that the non-blocking algorithm is faster on small
       * problems since in this case it avoids the overheads of blocking. 
       */

-      for(i=0; i<m; ++i){
-        for(j=i; j<m; ++j){
-          int lm;
+      /* looping downwards saves a few computations */
+      register int l, im;
+      register LM_REAL alpha, *jaclm;

-          for(l=0, tmp=0.0; l<n; ++l){
-            lm=l*m;
-            tmp+=jac[lm+i]*jac[lm+j];
-          }
+      for(i=m*m; i-->0; )
+        jacTjac[i]=0.0;
+      for(i=m; i-->0; )
+        jacTe[i]=0.0;

-		      /* store tmp in the corresponding upper and lower part elements */
-          jacTjac[i*m+j]=jacTjac[j*m+i]=tmp;
-        }
+      for(l=n; l-->0; ){
+        jaclm=jac+l*m;
+        for(i=m; i-->0; ){
+          im=i*m;
+          alpha=jaclm[i]; //jac[l*m+i];
+          for(j=i+1; j-->0; ) /* j<=i computes lower triangular part only */
+            jacTjac[im+j]+=jaclm[j]*alpha; //jac[l*m+j]

-        /* J^T e */
-        for(l=0, tmp=0.0; l<n; ++l)
-          tmp+=jac[l*m+i]*e[l];
-        jacTe[i]=tmp;
+          /* J^T e */
+          jacTe[i]+=alpha*e[l];
+        }
      }
+
+      for(i=m; i-->0; ) /* copy to upper part */
+        for(j=i+1; j<m; ++j)
+          jacTjac[i*m+j]=jacTjac[j*m+i];
+
    }
    else{ // this is a large problem
      /* Cache efficient computation of J^T J based on blocking
@@ -284,15 +298,15 @@ if(!(k%100)){
       * SVD is the slowest but most accurate; LU offers a tradeoff between accuracy and speed
       */

-      issolved=AX_EQ_B_LU(jacTjac, jacTe, Dp, m); linsolver=AX_EQ_B_LU;
-      //issolved=AX_EQ_B_CHOL(jacTjac, jacTe, Dp, m); linsolver=AX_EQ_B_CHOL;
-      //issolved=AX_EQ_B_QR(jacTjac, jacTe, Dp, m); linsolver=AX_EQ_B_QR;
-      //issolved=AX_EQ_B_QRLS(jacTjac, jacTe, Dp, m, m); linsolver=AX_EQ_B_QRLS;
-      //issolved=AX_EQ_B_SVD(jacTjac, jacTe, Dp, m); linsolver=AX_EQ_B_SVD;
+      issolved=AX_EQ_B_LU(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_LU;
+      //issolved=AX_EQ_B_CHOL(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_CHOL;
+      //issolved=AX_EQ_B_QR(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_QR;
+      //issolved=AX_EQ_B_QRLS(jacTjac, jacTe, Dp, m, m); ++nlss; linsolver=(int (*)(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m))AX_EQ_B_QRLS;
+      //issolved=AX_EQ_B_SVD(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_SVD;

 #else
      /* use the LU included with levmar */
-      issolved=AX_EQ_B_LU(jacTjac, jacTe, Dp, m); linsolver=AX_EQ_B_LU;
+      issolved=AX_EQ_B_LU(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_LU;
 #endif /* HAVE_LAPACK */

      if(issolved){
@@ -389,6 +403,7 @@ if(!(k%100)){
    info[6]=(LM_REAL)stop;
    info[7]=(LM_REAL)nfev;
    info[8]=(LM_REAL)njev;
+    info[9]=(LM_REAL)nlss;
  }

  /* covariance matrix */
@@ -436,6 +451,7 @@ int LEVMAR_DIF(
                      *                                 7 - stopped by invalid (i.e. NaN or Inf) "func" values. This is a user error
                      * info[7]= # function evaluations
                      * info[8]= # Jacobian evaluations
+                      * info[9]= # linear systems solved, i.e. # attempts for reducing error
                      */
  LM_REAL *work,     /* working memory at least LM_DIF_WORKSZ() reals large, allocated if NULL */
  LM_REAL *covar,    /* O: Covariance matrix corresponding to LS solution; mxm. Set to NULL if not needed. */
@@ -465,7 +481,7 @@ LM_REAL p_eL2, jacTe_inf, pDp_eL2; /* ||e(p)||_2, ||J^T e||_inf, ||e(p+Dp)||_2 *
 LM_REAL p_L2, Dp_L2=LM_REAL_MAX, dF, dL;
 LM_REAL tau, eps1, eps2, eps2_sq, eps3, delta;
 LM_REAL init_p_eL2;
-int nu, nu2, stop=0, nfev, njap=0, K=(m>=10)? m: 10, updjac, updp=1, newjac;
+int nu, nu2, stop=0, nfev, njap=0, nlss=0, K=(m>=10)? m: 10, updjac, updp=1, newjac;
 const int nm=n*m;
 int (*linsolver)(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m)=NULL;

@@ -503,7 +519,7 @@ int (*linsolver)(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m)=NULL;
    work=(LM_REAL *)malloc(worksz*sizeof(LM_REAL)); /* allocate a big chunk in one step */
    if(!work){
      fprintf(stderr, LCAT(LEVMAR_DIF, "(): memory allocation request failed\n"));
-      exit(1);
+      return LM_ERROR;
    }
    freework=1;
  }
@@ -565,37 +581,49 @@ int (*linsolver)(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m)=NULL;

      /* J^T J, J^T e */
      if(nm<=__BLOCKSZ__SQ){ // this is a small problem
-        /* This is the straightforward way to compute J^T J, J^T e. However, due to
-         * its noncontinuous memory access pattern, it incures many cache misses when
-         * applied to large minimization problems (i.e. problems involving a large
-         * number of free variables and measurements), in which J is too large to
-         * fit in the L1 cache. For such problems, a cache-efficient blocking scheme
-         * is preferable.
+        /* J^T*J_ij = \sum_l J^T_il * J_lj = \sum_l J_li * J_lj.
+         * Thus, the product J^T J can be computed using an outer loop for
+         * l that adds J_li*J_lj to each element ij of the result. Note that
+         * with this scheme, the accesses to J and JtJ are always along rows,
+         * therefore induces less cache misses compared to the straightforward
+         * algorithm for computing the product (i.e., l loop is innermost one).
+         * A similar scheme applies to the computation of J^T e.
+         * However, for large minimization problems (i.e., involving a large number
+         * of unknowns and measurements) for which J/J^T J rows are too large to
+         * fit in the L1 cache, even this scheme incures many cache misses. In
+         * such cases, a cache-efficient blocking scheme is preferable.
         *
         * Thanks to John Nitao of Lawrence Livermore Lab for pointing out this
         * performance problem.
         *
-         * On the other hand, the straightforward algorithm is faster on small
+         * Note that the non-blocking algorithm is faster on small
         * problems since in this case it avoids the overheads of blocking. 
         */
-      
-        for(i=0; i<m; ++i){
-          for(j=i; j<m; ++j){
-            int lm;
+        register int l, im;
+        register LM_REAL alpha, *jaclm;

-            for(l=0, tmp=0.0; l<n; ++l){
-              lm=l*m;
-              tmp+=jac[lm+i]*jac[lm+j];
-            }
+        /* looping downwards saves a few computations */
+        for(i=m*m; i-->0; )
+          jacTjac[i]=0.0;
+        for(i=m; i-->0; )
+          jacTe[i]=0.0;

-            jacTjac[i*m+j]=jacTjac[j*m+i]=tmp;
-          }
+        for(l=n; l-->0; ){
+          jaclm=jac+l*m;
+          for(i=m; i-->0; ){
+            im=i*m;
+            alpha=jaclm[i]; //jac[l*m+i];
+            for(j=i+1; j-->0; ) /* j<=i computes lower triangular part only */
+              jacTjac[im+j]+=jaclm[j]*alpha; //jac[l*m+j]

-          /* J^T e */
-          for(l=0, tmp=0.0; l<n; ++l)
-            tmp+=jac[l*m+i]*e[l];
-          jacTe[i]=tmp;
+            /* J^T e */
+            jacTe[i]+=alpha*e[l];
+          }
        }
+
+        for(i=m; i-->0; ) /* copy to upper part */
+          for(j=i+1; j<m; ++j)
+            jacTjac[i*m+j]=jacTjac[j*m+i];
      }
      else{ // this is a large problem
        /* Cache efficient computation of J^T J based on blocking
@@ -660,14 +688,14 @@ if(!(k%100)){
     * SVD is the slowest but most accurate; LU offers a tradeoff between accuracy and speed
     */

-    issolved=AX_EQ_B_LU(jacTjac, jacTe, Dp, m); linsolver=AX_EQ_B_LU;
-    //issolved=AX_EQ_B_CHOL(jacTjac, jacTe, Dp, m); linsolver=AX_EQ_B_CHOL;
-    //issolved=AX_EQ_B_QR(jacTjac, jacTe, Dp, m); linsolver=AX_EQ_B_QR;
-    //issolved=AX_EQ_B_QRLS(jacTjac, jacTe, Dp, m, m); linsolver=AX_EQ_B_QRLS;
-    //issolved=AX_EQ_B_SVD(jacTjac, jacTe, Dp, m); linsolver=AX_EQ_B_SVD;
+    issolved=AX_EQ_B_LU(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_LU;
+    //issolved=AX_EQ_B_CHOL(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_CHOL;
+    //issolved=AX_EQ_B_QR(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_QR;
+    //issolved=AX_EQ_B_QRLS(jacTjac, jacTe, Dp, m, m); ++nlss; linsolver=(int (*)(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m))AX_EQ_B_QRLS;
+    //issolved=AX_EQ_B_SVD(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_SVD;
 #else
    /* use the LU included with levmar */
-    issolved=AX_EQ_B_LU(jacTjac, jacTe, Dp, m); linsolver=AX_EQ_B_LU;
+    issolved=AX_EQ_B_LU(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_LU;
 #endif /* HAVE_LAPACK */

    if(issolved){
@@ -778,6 +806,7 @@ if(!(k%100)){
    info[6]=(LM_REAL)stop;
    info[7]=(LM_REAL)nfev;
    info[8]=(LM_REAL)njap;
+    info[9]=(LM_REAL)nlss;
  }

  /* covariance matrix */

--- a/src/levmar/lmbc_core.c
+++ b/src/levmar/lmbc_core.c
@@ -28,7 +28,7 @@
 #define BOXPROJECT LM_ADD_PREFIX(boxProject)
 #define LEVMAR_BOX_CHECK LM_ADD_PREFIX(levmar_box_check)
 #define LEVMAR_BC_DER LM_ADD_PREFIX(levmar_bc_der)
-#define LEVMAR_BC_DIF LM_ADD_PREFIX(levmar_bc_dif) //CHECKME
+#define LEVMAR_BC_DIF LM_ADD_PREFIX(levmar_bc_dif)
 #define LEVMAR_FDIF_FORW_JAC_APPROX LM_ADD_PREFIX(levmar_fdif_forw_jac_approx)
 #define LEVMAR_FDIF_CENT_JAC_APPROX LM_ADD_PREFIX(levmar_fdif_cent_jac_approx)
 #define LEVMAR_TRANS_MAT_MAT_MULT LM_ADD_PREFIX(levmar_trans_mat_mat_mult)
@@ -264,7 +264,7 @@ register int i;
 * This function requires an analytic Jacobian. In case the latter is unavailable,
 * use LEVMAR_BC_DIF() bellow
 *
- * Returns the number of iterations (>=0) if successfull, LM_ERROR if failed
+ * Returns the number of iterations (>=0) if successful, LM_ERROR if failed
 *
 * For details, see C. Kanzow, N. Yamashita and M. Fukushima: "Levenberg-Marquardt
 * methods for constrained nonlinear equations with strong local convergence properties",
@@ -301,6 +301,7 @@ int LEVMAR_BC_DER(
                      *                                 7 - stopped by invalid (i.e. NaN or Inf) "func" values. This is a user error
                      * info[7]= # function evaluations
                      * info[8]= # Jacobian evaluations
+                      * info[9]= # linear systems solved, i.e. # attempts for reducing error
                      */
  LM_REAL *work,     /* working memory at least LM_BC_DER_WORKSZ() reals large, allocated if NULL */
  LM_REAL *covar,    /* O: Covariance matrix corresponding to LS solution; mxm. Set to NULL if not needed. */
@@ -326,7 +327,7 @@ LM_REAL p_eL2, jacTe_inf, pDp_eL2; /* ||e(p)||_2, ||J^T e||_inf, ||e(p+Dp)||_2 *
 LM_REAL p_L2, Dp_L2=LM_REAL_MAX, dF, dL;
 LM_REAL tau, eps1, eps2, eps2_sq, eps3;
 LM_REAL init_p_eL2;
-int nu=2, nu2, stop=0, nfev, njev=0;
+int nu=2, nu2, stop=0, nfev, njev=0, nlss=0;
 const int nm=n*m;

 /* variables for constrained LM */
@@ -378,7 +379,7 @@ int (*linsolver)(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m)=NULL;
    work=(LM_REAL *)malloc(worksz*sizeof(LM_REAL)); /* allocate a big chunk in one step */
    if(!work){
      fprintf(stderr, LCAT(LEVMAR_BC_DER, "(): memory allocation request failed\n"));
-      exit(1);
+      return LM_ERROR;
    }
    freework=1;
  }
@@ -431,7 +432,7 @@ int (*linsolver)(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m)=NULL;
    }

    /* Compute the Jacobian J at p,  J^T J,  J^T e,  ||J^T e||_inf and ||p||^2.
-     * Since J^T J is symmetric, its computation can be speeded up by computing
+     * Since J^T J is symmetric, its computation can be sped up by computing
     * only its upper triangular part and copying it to the lower part
     */

@@ -439,38 +440,49 @@ int (*linsolver)(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m)=NULL;

    /* J^T J, J^T e */
    if(nm<__BLOCKSZ__SQ){ // this is a small problem
-      /* This is the straightforward way to compute J^T J, J^T e. However, due to
-       * its noncontinuous memory access pattern, it incures many cache misses when
-       * applied to large minimization problems (i.e. problems involving a large
-       * number of free variables and measurements), in which J is too large to
-       * fit in the L1 cache. For such problems, a cache-efficient blocking scheme
-       * is preferable.
+      /* J^T*J_ij = \sum_l J^T_il * J_lj = \sum_l J_li * J_lj.
+       * Thus, the product J^T J can be computed using an outer loop for
+       * l that adds J_li*J_lj to each element ij of the result. Note that
+       * with this scheme, the accesses to J and JtJ are always along rows,
+       * therefore induces less cache misses compared to the straightforward
+       * algorithm for computing the product (i.e., l loop is innermost one).
+       * A similar scheme applies to the computation of J^T e.
+       * However, for large minimization problems (i.e., involving a large number
+       * of unknowns and measurements) for which J/J^T J rows are too large to
+       * fit in the L1 cache, even this scheme incures many cache misses. In
+       * such cases, a cache-efficient blocking scheme is preferable.
       *
       * Thanks to John Nitao of Lawrence Livermore Lab for pointing out this
       * performance problem.
       *
-       * On the other hand, the straightforward algorithm is faster on small
+       * Note that the non-blocking algorithm is faster on small
       * problems since in this case it avoids the overheads of blocking. 
       */
+      register int l, im;
+      register LM_REAL alpha, *jaclm;

-      for(i=0; i<m; ++i){
-        for(j=i; j<m; ++j){
-          int lm;
+      /* looping downwards saves a few computations */
+      for(i=m*m; i-->0; )
+        jacTjac[i]=0.0;
+      for(i=m; i-->0; )
+        jacTe[i]=0.0;

-          for(l=0, tmp=0.0; l<n; ++l){
-            lm=l*m;
-            tmp+=jac[lm+i]*jac[lm+j];
-          }
+      for(l=n; l-->0; ){
+        jaclm=jac+l*m;
+        for(i=m; i-->0; ){
+          im=i*m;
+          alpha=jaclm[i]; //jac[l*m+i];
+          for(j=i+1; j-->0; ) /* j<=i computes lower triangular part only */
+            jacTjac[im+j]+=jaclm[j]*alpha; //jac[l*m+j]

-		      /* store tmp in the corresponding upper and lower part elements */
-          jacTjac[i*m+j]=jacTjac[j*m+i]=tmp;
+          /* J^T e */
+          jacTe[i]+=alpha*e[l];
        }
-
-        /* J^T e */
-        for(l=0, tmp=0.0; l<n; ++l)
-          tmp+=jac[l*m+i]*e[l];
-        jacTe[i]=tmp;
      }
+
+      for(i=m; i-->0; ) /* copy to upper part */
+        for(j=i+1; j<m; ++j)
+          jacTjac[i*m+j]=jacTjac[j*m+i];
    }
    else{ // this is a large problem
      /* Cache efficient computation of J^T J based on blocking
@@ -544,15 +556,15 @@ if(!(k%100)){
       * SVD is the slowest but most accurate; LU offers a tradeoff between accuracy and speed
       */

-      issolved=AX_EQ_B_LU(jacTjac, jacTe, Dp, m); linsolver=AX_EQ_B_LU;
-      //issolved=AX_EQ_B_CHOL(jacTjac, jacTe, Dp, m); linsolver=AX_EQ_B_CHOL;
-      //issolved=AX_EQ_B_QR(jacTjac, jacTe, Dp, m); linsolver=AX_EQ_B_QR;
-      //issolved=AX_EQ_B_QRLS(jacTjac, jacTe, Dp, m, m); linsolver=AX_EQ_B_QRLS;
-      //issolved=AX_EQ_B_SVD(jacTjac, jacTe, Dp, m); linsolver=AX_EQ_B_SVD;
+      issolved=AX_EQ_B_LU(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_LU;
+      //issolved=AX_EQ_B_CHOL(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_CHOL;
+      //issolved=AX_EQ_B_QR(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_QR;
+      //issolved=AX_EQ_B_QRLS(jacTjac, jacTe, Dp, m, m); ++nlss; linsolver=(int (*)(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m))AX_EQ_B_QRLS;
+      //issolved=AX_EQ_B_SVD(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_SVD;

 #else
      /* use the LU included with levmar */
-      issolved=AX_EQ_B_LU(jacTjac, jacTe, Dp, m); linsolver=AX_EQ_B_LU;
+      issolved=AX_EQ_B_LU(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_LU;
 #endif /* HAVE_LAPACK */

      if(issolved){
@@ -784,6 +796,7 @@ breaknested: /* NOTE: this point is also reached via an explicit goto! */
    info[6]=(LM_REAL)stop;
    info[7]=(LM_REAL)nfev;
    info[8]=(LM_REAL)njev;
+    info[9]=(LM_REAL)nlss;
  }

  /* covariance matrix */
@@ -808,13 +821,14 @@ printf("%d LM steps, %d line search, %d projected gradient\n", nLMsteps, nLSstep
 * version of LEVMAR_BC_DIF() is implemented...
 */
 struct LMBC_DIF_DATA{
+  int ffdif; // nonzero if forward differencing is used
  void (*func)(LM_REAL *p, LM_REAL *hx, int m, int n, void *adata);
  LM_REAL *hx, *hxx;
  void *adata;
  LM_REAL delta;
 };

-void LMBC_DIF_FUNC(LM_REAL *p, LM_REAL *hx, int m, int n, void *data)
+static void LMBC_DIF_FUNC(LM_REAL *p, LM_REAL *hx, int m, int n, void *data)
 {
 struct LMBC_DIF_DATA *dta=(struct LMBC_DIF_DATA *)data;

@@ -822,13 +836,17 @@ struct LMBC_DIF_DATA *dta=(struct LMBC_DIF_DATA *)data;
  (*(dta->func))(p, hx, m, n, dta->adata);
 }

-void LMBC_DIF_JACF(LM_REAL *p, LM_REAL *jac, int m, int n, void *data)
+static void LMBC_DIF_JACF(LM_REAL *p, LM_REAL *jac, int m, int n, void *data)
 {
 struct LMBC_DIF_DATA *dta=(struct LMBC_DIF_DATA *)data;

-  /* evaluate user-supplied function at p */
-  (*(dta->func))(p, dta->hx, m, n, dta->adata);
-  LEVMAR_FDIF_FORW_JAC_APPROX(dta->func, p, dta->hx, dta->hxx, dta->delta, jac, m, n, dta->adata);
+  if(dta->ffdif){
+    /* evaluate user-supplied function at p */
+    (*(dta->func))(p, dta->hx, m, n, dta->adata);
+    LEVMAR_FDIF_FORW_JAC_APPROX(dta->func, p, dta->hx, dta->hxx, dta->delta, jac, m, n, dta->adata);
+  }
+  else
+    LEVMAR_FDIF_CENT_JAC_APPROX(dta->func, p, dta->hx, dta->hxx, dta->delta, jac, m, n, dta->adata);
 }


@@ -866,6 +884,7 @@ int LEVMAR_BC_DIF(
                      *                                 7 - stopped by invalid (i.e. NaN or Inf) "func" values. This is a user error
                      * info[7]= # function evaluations
                      * info[8]= # Jacobian evaluations
+                      * info[9]= # linear systems solved, i.e. # attempts for reducing error
                      */
  LM_REAL *work,     /* working memory at least LM_BC_DIF_WORKSZ() reals large, allocated if NULL */
  LM_REAL *covar,    /* O: Covariance matrix corresponding to LS solution; mxm. Set to NULL if not needed. */
@@ -876,27 +895,34 @@ int LEVMAR_BC_DIF(
 struct LMBC_DIF_DATA data;
 int ret;

-    //fprintf(stderr, RCAT("\nWarning: current implementation of ", LEVMAR_BC_DIF) "() does not use a secant approach!\n\n");
+  //fprintf(stderr, RCAT("\nWarning: current implementation of ", LEVMAR_BC_DIF) "() does not use a secant approach!\n\n");

-    data.func=func;
-    data.hx=(LM_REAL *)malloc(2*n*sizeof(LM_REAL)); /* allocate a big chunk in one step */
-    if(!data.hx){
-      fprintf(stderr, LCAT(LEVMAR_BC_DIF, "(): memory allocation request failed\n"));
-      exit(1);
-    }
-    data.hxx=data.hx+n;
-    data.adata=adata;
-    data.delta=(opts)? FABS(opts[4]) : (LM_REAL)LM_DIFF_DELTA; // no central differences here...
+  data.ffdif=!opts || opts[4]>=0.0;

-    ret=LEVMAR_BC_DER(LMBC_DIF_FUNC, LMBC_DIF_JACF, p, x, m, n, lb, ub, itmax, opts, info, work, covar, (void *)&data);
+  data.func=func;
+  data.hx=(LM_REAL *)malloc(2*n*sizeof(LM_REAL)); /* allocate a big chunk in one step */
+  if(!data.hx){
+    fprintf(stderr, LCAT(LEVMAR_BC_DIF, "(): memory allocation request failed\n"));
+    return LM_ERROR;
+  }
+  data.hxx=data.hx+n;
+  data.adata=adata;
+  data.delta=(opts)? FABS(opts[4]) : (LM_REAL)LM_DIFF_DELTA;
+
+  ret=LEVMAR_BC_DER(LMBC_DIF_FUNC, LMBC_DIF_JACF, p, x, m, n, lb, ub, itmax, opts, info, work, covar, (void *)&data);

-    if(info) /* correct the number of function calls */
+  if(info){ /* correct the number of function calls */
+    if(data.ffdif)
      info[7]+=info[8]*(m+1); /* each Jacobian evaluation costs m+1 function calls */
+    else
+      info[7]+=info[8]*(2*m); /* each Jacobian evaluation costs 2*m function calls */
+  }

-    free(data.hx);
+  free(data.hx);

-    return ret;
+  return ret;
 }
+
 /* undefine everything. THIS MUST REMAIN AT THE END OF THE FILE */
 #undef FUNC_STATE
 #undef LNSRCH

--- a/src/levmar/lmblec_core.c
+++ b/src/levmar/lmblec_core.c
@@ -70,7 +70,6 @@
 #define LEVMAR_BLEC_DER LM_ADD_PREFIX(levmar_blec_der)
 #define LEVMAR_BLEC_DIF LM_ADD_PREFIX(levmar_blec_dif)
 #define LEVMAR_COVAR LM_ADD_PREFIX(levmar_covar)
-#define LEVMAR_FDIF_FORW_JAC_APPROX LM_ADD_PREFIX(levmar_fdif_forw_jac_approx)

 struct LMBLEC_DATA{
  LM_REAL *x, *lb, *ub, *w;
@@ -177,7 +176,7 @@ register LM_REAL *lb, *ub, *w, tmp;
 * This function requires an analytic Jacobian. In case the latter is unavailable,
 * use LEVMAR_BLEC_DIF() bellow
 *
- * Returns the number of iterations (>=0) if successfull, LM_ERROR if failed
+ * Returns the number of iterations (>=0) if successful, LM_ERROR if failed
 *
 * For more details on the algorithm implemented by this function, please refer to
 * the comments in the top of this file.
@@ -214,6 +213,7 @@ int LEVMAR_BLEC_DER(
                      *                                 7 - stopped by invalid (i.e. NaN or Inf) "func" values. This is a user error
                      * info[7]= # function evaluations
                      * info[8]= # Jacobian evaluations
+                      * info[9]= # linear systems solved, i.e. # attempts for reducing error
                      */
  LM_REAL *work,     /* working memory at least LM_BLEC_DER_WORKSZ() reals large, allocated if NULL */
  LM_REAL *covar,    /* O: Covariance matrix corresponding to LS solution; mxm. Set to NULL if not needed. */
@@ -232,6 +232,12 @@ int LEVMAR_BLEC_DER(
    return LM_ERROR;
  }

+  if(!lb && !ub){
+    fprintf(stderr, RCAT(LCAT(LEVMAR_BLEC_DER, "(): lower and upper bounds for box constraints cannot be both NULL, use "),
+          LEVMAR_LEC_DER) "() in this case!\n");
+    return LM_ERROR;
+  }
+
  if(!LEVMAR_BOX_CHECK(lb, ub, m)){
    fprintf(stderr, LCAT(LEVMAR_BLEC_DER, "(): at least one lower bound exceeds the upper one\n"));
    return LM_ERROR;
@@ -242,7 +248,7 @@ int LEVMAR_BLEC_DER(
    data.x=(LM_REAL *)malloc((n+m)*sizeof(LM_REAL));
    if(!data.x){
      fprintf(stderr, LCAT(LEVMAR_BLEC_DER, "(): memory allocation request #1 failed\n"));
-      exit(1);
+      return LM_ERROR;
    }

    for(i=0; i<n; ++i)
@@ -253,16 +259,20 @@ int LEVMAR_BLEC_DER(
  else
    data.x=NULL;

-  data.w=(LM_REAL *)malloc(m*sizeof(LM_REAL) + m*sizeof(int));
+  data.w=(LM_REAL *)malloc(m*sizeof(LM_REAL) + m*sizeof(int)); /* should be arranged in that order for proper doubles alignment */
  if(!data.w){
    fprintf(stderr, LCAT(LEVMAR_BLEC_DER, "(): memory allocation request #2 failed\n"));
-    exit(1);
+    if(data.x) free(data.x);
+    return LM_ERROR;
  }
  data.bctype=(int *)(data.w+m);

+  /* note: at this point, one of lb, ub are not NULL */
  for(i=0; i<m; ++i){
    data.w[i]=(!wghts)? __BC_WEIGHT__ : wghts[i];
-    if(ub[i]!=LM_REAL_MAX && lb[i]!=LM_REAL_MIN) data.bctype[i]=__BC_INTERVAL__;
+    if(!lb) data.bctype[i]=__BC_HIGH__;
+    else if(!ub) data.bctype[i]=__BC_LOW__;
+    else if(ub[i]!=LM_REAL_MAX && lb[i]!=LM_REAL_MIN) data.bctype[i]=__BC_INTERVAL__;
    else if(lb[i]!=LM_REAL_MIN) data.bctype[i]=__BC_LOW__;
    else data.bctype[i]=__BC_HIGH__;
  }
@@ -318,6 +328,7 @@ int LEVMAR_BLEC_DIF(
                      *                                 7 - stopped by invalid (i.e. NaN or Inf) "func" values. This is a user error
                      * info[7]= # function evaluations
                      * info[8]= # Jacobian evaluations
+                      * info[9]= # linear systems solved, i.e. # attempts for reducing error
                      */
  LM_REAL *work,     /* working memory at least LM_BLEC_DIF_WORKSZ() reals large, allocated if NULL */
  LM_REAL *covar,    /* O: Covariance matrix corresponding to LS solution; mxm. Set to NULL if not needed. */
@@ -330,6 +341,12 @@ int LEVMAR_BLEC_DIF(
  register int i;
  LM_REAL locinfo[LM_INFO_SZ];

+  if(!lb && !ub){
+    fprintf(stderr, RCAT(LCAT(LEVMAR_BLEC_DIF, "(): lower and upper bounds for box constraints cannot be both NULL, use "),
+          LEVMAR_LEC_DIF) "() in this case!\n");
+    return LM_ERROR;
+  }
+
  if(!LEVMAR_BOX_CHECK(lb, ub, m)){
    fprintf(stderr, LCAT(LEVMAR_BLEC_DER, "(): at least one lower bound exceeds the upper one\n"));
    return LM_ERROR;
@@ -340,7 +357,7 @@ int LEVMAR_BLEC_DIF(
    data.x=(LM_REAL *)malloc((n+m)*sizeof(LM_REAL));
    if(!data.x){
      fprintf(stderr, LCAT(LEVMAR_BLEC_DER, "(): memory allocation request #1 failed\n"));
-      exit(1);
+      return LM_ERROR;
    }

    for(i=0; i<n; ++i)
@@ -351,16 +368,20 @@ int LEVMAR_BLEC_DIF(
  else
    data.x=NULL;

-  data.w=(LM_REAL *)malloc(m*sizeof(LM_REAL) + m*sizeof(int));
+  data.w=(LM_REAL *)malloc(m*sizeof(LM_REAL) + m*sizeof(int)); /* should be arranged in that order for proper doubles alignment */
  if(!data.w){
    fprintf(stderr, LCAT(LEVMAR_BLEC_DER, "(): memory allocation request #2 failed\n"));
-    exit(1);
+    if(data.x) free(data.x);
+    return LM_ERROR;
  }
  data.bctype=(int *)(data.w+m);

+  /* note: at this point, one of lb, ub are not NULL */
  for(i=0; i<m; ++i){
    data.w[i]=(!wghts)? __BC_WEIGHT__ : wghts[i];
-    if(ub[i]!=LM_REAL_MAX && lb[i]!=LM_REAL_MIN) data.bctype[i]=__BC_INTERVAL__;
+    if(!lb) data.bctype[i]=__BC_HIGH__;
+    else if(!ub) data.bctype[i]=__BC_LOW__;
+    else if(ub[i]!=LM_REAL_MAX && lb[i]!=LM_REAL_MIN) data.bctype[i]=__BC_INTERVAL__;
    else if(lb[i]!=LM_REAL_MIN) data.bctype[i]=__BC_LOW__;
    else data.bctype[i]=__BC_HIGH__;
  }
@@ -385,7 +406,6 @@ int LEVMAR_BLEC_DIF(
 #undef LMBLEC_DATA
 #undef LMBLEC_FUNC
 #undef LMBLEC_JACF
-#undef LEVMAR_FDIF_FORW_JAC_APPROX
 #undef LEVMAR_COVAR
 #undef LEVMAR_LEC_DER
 #undef LEVMAR_LEC_DIF

--- a/src/levmar/lmdemo.c
+++ b/src/levmar/lmdemo.c
@@ -685,7 +685,8 @@ char *probname[]={
 };

  opts[0]=LM_INIT_MU; opts[1]=1E-15; opts[2]=1E-15; opts[3]=1E-20;
-  opts[4]=LM_DIFF_DELTA; // relevant only if the finite difference Jacobian version is used 
+  opts[4]=LM_DIFF_DELTA; // relevant only if the Jacobian is approximated using finite differences; specifies forward differencing
+  //opts[4]=-LM_DIFF_DELTA; // specifies central differencing to approximate Jacobian; more accurate but more expensive to compute!

  /* uncomment the appropriate line below to select a minimization problem */
  problem=

--- a/src/levmar/lmlec_core.c
+++ b/src/levmar/lmlec_core.c
@@ -35,9 +35,9 @@
 #define LEVMAR_COVAR LM_ADD_PREFIX(levmar_covar)
 #define LEVMAR_FDIF_FORW_JAC_APPROX LM_ADD_PREFIX(levmar_fdif_forw_jac_approx)

-#define GEQP3 LM_ADD_PREFIX(geqp3_)
-#define ORGQR LM_ADD_PREFIX(orgqr_)
-#define TRTRI LM_ADD_PREFIX(trtri_)
+#define GEQP3 LM_MK_LAPACK_NAME(geqp3)
+#define ORGQR LM_MK_LAPACK_NAME(orgqr)
+#define TRTRI LM_MK_LAPACK_NAME(trtri)

 struct LMLEC_DATA{
  LM_REAL *c, *Z, *p, *jac;
@@ -76,7 +76,7 @@ extern int TRTRI(char *uplo, char *diag, int *n, LM_REAL *a, int *lda, int *info
 *
 * The function accepts A, b and computes c, Y, Z. If b or c is NULL, c is not
 * computed. Also, Y can be NULL in which case it is not referenced.
- * The function returns 0 in case of error, A's computed rank if successfull
+ * The function returns LM_ERROR in case of error, A's computed rank if successful
 *
 */
 static int LMLEC_ELIM(LM_REAL *A, LM_REAL *b, LM_REAL *c, LM_REAL *Y, LM_REAL *Z, int m, int n)
@@ -109,19 +109,23 @@ register int i, j, k;
  r_sz=mintmn*mintmn; // actually smaller if a is not of full row rank
  Y_sz=(Y)? 0 : tm*tn;

-  tot_sz=jpvt_sz*sizeof(int) + (a_sz + tau_sz + r_sz + worksz + Y_sz)*sizeof(LM_REAL);
+  tot_sz=(a_sz + tau_sz + r_sz + worksz + Y_sz)*sizeof(LM_REAL) + jpvt_sz*sizeof(int); /* should be arranged in that order for proper doubles alignment */
  buf=(LM_REAL *)malloc(tot_sz); /* allocate a "big" memory chunk at once */
  if(!buf){
    fprintf(stderr, RCAT("Memory allocation request failed in ", LMLEC_ELIM) "()\n");
-    exit(1);
+    return LM_ERROR;
  }

-  a=(LM_REAL *)buf;
-  jpvt=(int *)(a+a_sz);
-  tau=(LM_REAL *)(jpvt + jpvt_sz);
+  a=buf;
+  tau=a+a_sz;
  r=tau+tau_sz;
  work=r+r_sz;
-  if(!Y) Y=work+worksz;
+  if(!Y){
+    Y=work+worksz;
+    jpvt=(int *)(Y+Y_sz);
+  }
+  else
+    jpvt=(int *)(work+worksz);

  /* copy input array so that LAPACK won't destroy it. Note that copying is
   * done in row-major order, which equals A^T in column-major
@@ -139,13 +143,12 @@ register int i, j, k;
  if(info!=0){
    if(info<0){
      fprintf(stderr, RCAT(RCAT("LAPACK error: illegal value for argument %d of ", GEQP3) " in ", LMLEC_ELIM) "()\n", -info);
-      exit(1);
    }
    else if(info>0){
      fprintf(stderr, RCAT(RCAT("unknown LAPACK error (%d) for ", GEQP3) " in ", LMLEC_ELIM) "()\n", info);
-      free(buf);
-      return 0;
    }
+    free(buf);
+    return LM_ERROR;
  }
  /* the upper triangular part of a now contains the upper triangle of the unpermuted R */

@@ -168,7 +171,6 @@ register int i, j, k;
  if(rank<tn){
    fprintf(stderr, RCAT("\nConstraints matrix in ",  LMLEC_ELIM) "() is not of full row rank (i.e. %d < %d)!\n"
            "Make sure that you do not specify redundant or inconsistent constraints.\n\n", rank, tn);
-    //exit(1);
    free(buf);
    return LM_ERROR;
  }
@@ -188,13 +190,12 @@ register int i, j, k;
  if(info!=0){
    if(info<0){
      fprintf(stderr, RCAT(RCAT("LAPACK error: illegal value for argument %d of ", TRTRI) " in ", LMLEC_ELIM) "()\n", -info);
-      exit(1);
    }
    else if(info>0){
      fprintf(stderr, RCAT(RCAT("A(%d, %d) is exactly zero for ", TRTRI) " (singular matrix) in ", LMLEC_ELIM) "()\n", info, info);
-      free(buf);
-      return 0;
    }
+    free(buf);
+    return LM_ERROR;
  }
  /* then, transpose r in place */
  for(i=0; i<rank; ++i)
@@ -223,13 +224,12 @@ register int i, j, k;
  if(info!=0){
    if(info<0){
      fprintf(stderr, RCAT(RCAT("LAPACK error: illegal value for argument %d of ", ORGQR) " in ", LMLEC_ELIM) "()\n", -info);
-      exit(1);
    }
    else if(info>0){
      fprintf(stderr, RCAT(RCAT("unknown LAPACK error (%d) for ", ORGQR) " in ", LMLEC_ELIM) "()\n", info);
-      free(buf);
-      return 0;
    }
+    free(buf);
+    return LM_ERROR;
  }

  /* compute Y=Q_1*R^-T*P^T. Y is tm x rank */
@@ -399,6 +399,7 @@ int LEVMAR_LEC_DER(
                      *                                 7 - stopped by invalid (i.e. NaN or Inf) "func" values. This is a user error
                      * info[7]= # function evaluations
                      * info[8]= # Jacobian evaluations
+                      * info[9]= # linear systems solved, i.e. # attempts for reducing error
                      */
  LM_REAL *work,     /* working memory at least LM_LEC_DER_WORKSZ() reals large, allocated if NULL */
  LM_REAL *covar,    /* O: Covariance matrix corresponding to LS solution; mxm. Set to NULL if not needed. */
@@ -422,14 +423,14 @@ int LEVMAR_LEC_DER(
  mm=m-k;

  if(n<mm){
-    fprintf(stderr, LCAT(LEVMAR_LEC_DER, "(): cannot solve a problem with fewer measurements + constraints [%d + %d] than unknowns [%d]\n"), n, k, m);
+    fprintf(stderr, LCAT(LEVMAR_LEC_DER, "(): cannot solve a problem with fewer measurements + equality constraints [%d + %d] than unknowns [%d]\n"), n, k, m);
    return LM_ERROR;
  }

  ptr=(LM_REAL *)malloc((2*m + m*mm + n*m + mm)*sizeof(LM_REAL));
  if(!ptr){
    fprintf(stderr, LCAT(LEVMAR_LEC_DER, "(): memory allocation request failed\n"));
-    exit(1);
+    return LM_ERROR;
  }
  data.p=p;
  p0=ptr;
@@ -530,6 +531,7 @@ int LEVMAR_LEC_DIF(
                      *                                 7 - stopped by invalid (i.e. NaN or Inf) "func" values. This is a user error
                      * info[7]= # function evaluations
                      * info[8]= # Jacobian evaluations
+                      * info[9]= # linear systems solved, i.e. # attempts for reducing error
                      */
  LM_REAL *work,     /* working memory at least LM_LEC_DIF_WORKSZ() reals large, allocated if NULL */
  LM_REAL *covar,    /* O: Covariance matrix corresponding to LS solution; mxm. Set to NULL if not needed. */
@@ -547,14 +549,14 @@ int LEVMAR_LEC_DIF(
  mm=m-k;

  if(n<mm){
-    fprintf(stderr, LCAT(LEVMAR_LEC_DIF, "(): cannot solve a problem with fewer measurements + constraints [%d + %d] than unknowns [%d]\n"), n, k, m);
+    fprintf(stderr, LCAT(LEVMAR_LEC_DIF, "(): cannot solve a problem with fewer measurements + equality constraints [%d + %d] than unknowns [%d]\n"), n, k, m);
    return LM_ERROR;
  }

  ptr=(LM_REAL *)malloc((2*m + m*mm + mm)*sizeof(LM_REAL));
  if(!ptr){
    fprintf(stderr, LCAT(LEVMAR_LEC_DIF, "(): memory allocation request failed\n"));
-    exit(1);
+    return LM_ERROR;
  }
  data.p=p;
  p0=ptr;
@@ -618,7 +620,8 @@ int LEVMAR_LEC_DIF(
    hx=(LM_REAL *)malloc((2*n+n*m)*sizeof(LM_REAL));
    if(!hx){
      fprintf(stderr, LCAT(LEVMAR_LEC_DIF, "(): memory allocation request failed\n"));
-      exit(1);
+      free(ptr);
+      return LM_ERROR;
    }

    wrk=hx+n;

--- a/src/levmar/misc.h
+++ b/src/levmar/misc.h
@@ -20,7 +20,13 @@
 #ifndef _MISC_H_
 #define _MISC_H_

-/* common prefix for BLAS subroutines. Leave undefined in case of no prefix. You might also need to modify LM_BLAS_PREFIX below */
+/* common suffix for LAPACK subroutines. Define empty in case of no prefix. */
+#define LM_LAPACK_SUFFIX _
+//#define LM_LAPACK_SUFFIX  // define empty
+
+/* common prefix for BLAS subroutines. Leave undefined in case of no prefix.
+ * You might also need to modify LM_BLAS_PREFIX below
+ */
 /* f2c'd BLAS */
 //#define LM_BLAS_PREFIX f2c_
 /* C BLAS */
@@ -36,6 +42,9 @@
 #define RCAT_(a, b)    a #b
 #define RCAT(a, b)    RCAT_(a, b) // force substitution

+#define LM_MK_LAPACK_NAME(s)  LM_ADD_PREFIX(LM_CAT_(s, LM_LAPACK_SUFFIX))
+
+
 #define __BLOCKSZ__       32 /* block size for cache-friendly matrix-matrix multiply. It should be
                              * such that __BLOCKSZ__^2*sizeof(LM_REAL) is smaller than the CPU (L1)
                              * data cache size. Notice that a value of 32 when LM_REAL=double assumes

--- a/src/levmar/misc_core.c
+++ b/src/levmar/misc_core.c
@@ -30,6 +30,7 @@
 #define LEVMAR_COVAR LM_ADD_PREFIX(levmar_covar)
 #define LEVMAR_STDDEV LM_ADD_PREFIX(levmar_stddev)
 #define LEVMAR_CORCOEF LM_ADD_PREFIX(levmar_corcoef)
+#define LEVMAR_R2 LM_ADD_PREFIX(levmar_R2)
 #define LEVMAR_BOX_CHECK LM_ADD_PREFIX(levmar_box_check)
 #define LEVMAR_L2NRMXMY LM_ADD_PREFIX(levmar_L2nrmxmy)

@@ -47,8 +48,8 @@ static int LEVMAR_PSEUDOINVERSE(LM_REAL *A, LM_REAL *B, int m);
 extern void GEMM(char *transa, char *transb, int *m, int *n, int *k,
          LM_REAL *alpha, LM_REAL *a, int *lda, LM_REAL *b, int *ldb, LM_REAL *beta, LM_REAL *c, int *ldc);

-#define GESVD LM_ADD_PREFIX(gesvd_)
-#define GESDD LM_ADD_PREFIX(gesdd_)
+#define GESVD LM_MK_LAPACK_NAME(gesvd)
+#define GESDD LM_MK_LAPACK_NAME(gesdd)
 extern int GESVD(char *jobu, char *jobvt, int *m, int *n, LM_REAL *a, int *lda, LM_REAL *s, LM_REAL *u, int *ldu,
                 LM_REAL *vt, int *ldvt, LM_REAL *work, int *lwork, int *info);

@@ -56,8 +57,8 @@ extern int GESVD(char *jobu, char *jobvt, int *m, int *n, LM_REAL *a, int *lda,
 extern int GESDD(char *jobz, int *m, int *n, LM_REAL *a, int *lda, LM_REAL *s, LM_REAL *u, int *ldu, LM_REAL *vt, int *ldvt,
                 LM_REAL *work, int *lwork, int *iwork, int *info);

-/* cholesky decomposition */
-#define POTF2 LM_ADD_PREFIX(potf2_)
+/* Cholesky decomposition */
+#define POTF2 LM_MK_LAPACK_NAME(potf2)
 extern int POTF2(char *uplo, int *n, LM_REAL *a, int *lda, int *info);

 #define LEVMAR_CHOLESKY LM_ADD_PREFIX(levmar_chol)
@@ -70,7 +71,7 @@ static int LEVMAR_LUINVERSE(LM_REAL *A, LM_REAL *B, int m);

 /* blocked multiplication of the transpose of the nxm matrix a with itself (i.e. a^T a)
 * using a block size of bsize. The product is returned in b.
- * Since a^T a is symmetric, its computation can be speeded up by computing only its
+ * Since a^T a is symmetric, its computation can be sped up by computing only its
 * upper triangular part and copying it to the lower part.
 *
 * More details on blocking can be found at 
@@ -323,7 +324,7 @@ int fvec_sz=n, fjac_sz=n*m, pp_sz=m, fvecp_sz=n;
 * into B using SVD. A and B can coincide
 * 
 * The function returns 0 in case of error (e.g. A is singular),
- * the rank of A if successfull
+ * the rank of A if successful
 *
 * A, B are mxm
 *
@@ -341,32 +342,33 @@ LM_REAL thresh, one_over_denom;
 int info, rank, worksz, *iwork, iworksz;
   
  /* calculate required memory size */
-  worksz=16*m; /* more than needed */
+  worksz=5*m; // min worksize for GESVD
+  //worksz=m*(7*m+4); // min worksize for GESDD
  iworksz=8*m;
  a_sz=m*m;
  u_sz=m*m; s_sz=m; vt_sz=m*m;

-  tot_sz=iworksz*sizeof(int) + (a_sz + u_sz + s_sz + vt_sz + worksz)*sizeof(LM_REAL);
+  tot_sz=(a_sz + u_sz + s_sz + vt_sz + worksz)*sizeof(LM_REAL) + iworksz*sizeof(int); /* should be arranged in that order for proper doubles alignment */

    buf_sz=tot_sz;
    buf=(LM_REAL *)malloc(buf_sz);
    if(!buf){
      fprintf(stderr, RCAT("memory allocation in ", LEVMAR_PSEUDOINVERSE) "() failed!\n");
-      exit(1);
+      return 0; /* error */
    }

-  iwork=(int *)buf;
-  a=(LM_REAL *)(iwork+iworksz);
+  a=buf;
+  u=a+a_sz;
+  s=u+u_sz;
+  vt=s+s_sz;
+  work=vt+vt_sz;
+  iwork=(int *)(work+worksz);
+
  /* store A (column major!) into a */
  for(i=0; i<m; i++)
    for(j=0; j<m; j++)
      a[i+j*m]=A[i*m+j];

-  u=a + a_sz;
-  s=u+u_sz;
-  vt=s+s_sz;
-  work=vt+vt_sz;
-
  /* SVD decomposition of A */
  GESVD("A", "A", (int *)&m, (int *)&m, a, (int *)&m, s, u, (int *)&m, vt, (int *)&m, work, (int *)&worksz, &info);
  //GESDD("A", (int *)&m, (int *)&m, a, (int *)&m, s, u, (int *)&m, vt, (int *)&m, work, (int *)&worksz, iwork, &info);
@@ -375,14 +377,12 @@ int info, rank, worksz, *iwork, iworksz;
  if(info!=0){
    if(info<0){
      fprintf(stderr, RCAT(RCAT(RCAT("LAPACK error: illegal value for argument %d of ", GESVD), "/" GESDD) " in ", LEVMAR_PSEUDOINVERSE) "()\n", -info);
-      exit(1);
    }
    else{
      fprintf(stderr, RCAT("LAPACK error: dgesdd (dbdsdc)/dgesvd (dbdsqr) failed to converge in ", LEVMAR_PSEUDOINVERSE) "() [info=%d]\n", info);
-      free(buf);
-
-      return 0;
    }
+    free(buf);
+    return 0;
  }

  if(eps<0.0){
@@ -421,8 +421,7 @@ static int SVDINV(LM_REAL *a, LM_REAL *b, int m);
 *
 * A and B are mxm
 *
- * The function returns 0 in case of error,
- * 1 if successfull
+ * The function returns 0 in case of error, 1 if successful
 *
 */
 static int LEVMAR_LUINVERSE(LM_REAL *A, LM_REAL *B, int m)
@@ -439,19 +438,19 @@ LM_REAL *a, *x, *work, max, sum, tmp;
  a_sz=m*m;
  x_sz=m;
  work_sz=m;
-  tot_sz=idx_sz*sizeof(int) + (a_sz+x_sz+work_sz)*sizeof(LM_REAL);
+  tot_sz=(a_sz + x_sz + work_sz)*sizeof(LM_REAL) + idx_sz*sizeof(int); /* should be arranged in that order for proper doubles alignment */

  buf_sz=tot_sz;
  buf=(void *)malloc(tot_sz);
  if(!buf){
    fprintf(stderr, RCAT("memory allocation in ", LEVMAR_LUINVERSE) "() failed!\n");
-    exit(1);
+    return 0; /* error */
  }

-  idx=(int *)buf;
-  a=(LM_REAL *)(idx + idx_sz);
-  x=a + a_sz;
-  work=x + x_sz;
+  a=buf;
+  x=a+a_sz;
+  work=x+x_sz;
+  idx=(int *)(work+work_sz);

  /* avoid destroying A by copying it to a */
  for(i=0; i<a_sz; ++i) a[i]=A[i];
@@ -623,6 +622,44 @@ LM_REAL LEVMAR_CORCOEF(LM_REAL *covar, int m, int i, int j)
   return (LM_REAL)(covar[i*m+j]/sqrt(covar[i*m+i]*covar[j*m+j]));
 }

+/* coefficient of determination.
+ * see  http://en.wikipedia.org/wiki/Coefficient_of_determination
+ */
+LM_REAL LEVMAR_R2(void (*func)(LM_REAL *p, LM_REAL *hx, int m, int n, void *adata),
+                  LM_REAL *p, LM_REAL *x, int m, int n, void *adata)
+{
+register int i;
+register LM_REAL tmp;
+LM_REAL SSerr,  // sum of squared errors, i.e. residual sum of squares \sum_i (x_i-hx_i)^2 
+        SStot, // \sum_i (x_i-xavg)^2
+        *hx, xavg;
+
+
+  if((hx=(LM_REAL *)malloc(n*sizeof(LM_REAL)))==NULL){
+    fprintf(stderr, RCAT("memory allocation request failed in ", LEVMAR_R2) "()\n");
+    exit(1);
+  }
+
+  /* hx=f(p) */
+  (*func)(p, hx, m, n, adata);
+
+  for(i=0, tmp=0.0; i<n; ++i)
+    tmp+=x[i];
+  xavg=tmp/(LM_REAL)n;
+  
+  for(i=0, SSerr=SStot=0.0; i<n; ++i){
+    tmp=x[i]-hx[i];
+    SSerr+=tmp*tmp;
+
+    tmp=x[i]-xavg;
+    SStot+=tmp*tmp;
+  }
+
+  free(hx);
+
+  return LM_CNST(1.0) - SSerr/SStot;
+}
+
 /* check box constraints for consistency */
 int LEVMAR_BOX_CHECK(LM_REAL *lb, LM_REAL *ub, int m)
 {
@@ -638,30 +675,36 @@ register int i;

 #ifdef HAVE_LAPACK

-/* compute the Cholesky decompostion of C in W, s.t. C=W^t W and W is upper triangular */
+/* compute the Cholesky decomposition of C in W, s.t. C=W^t W and W is upper triangular */
 int LEVMAR_CHOLESKY(LM_REAL *C, LM_REAL *W, int m)
 {
 register int i, j;
 int info;

-  /* copy weights array C to W (in column-major order!) so that LAPACK won't destroy it */
-  for(i=0; i<m; i++)
-    for(j=0; j<m; j++)
-      W[i+j*m]=C[i*m+j];
+/* compute the Cholesky decomposition of C in W, s.t. C=W^t W and W is upper triangular */
+int LEVMAR_CHOLESKY(LM_REAL *C, LM_REAL *W, int m)
+{
+register int i, j;
+int info;

-  /* cholesky decomposition */
+  /* copy weights array C to W so that LAPACK won't destroy it;
+   * C is assumed symmetric, hence no transposition is needed
+   */
+  for(i=0, j=m*m; i<j; ++i)
+    W[i]=C[i];
+
+  /* Cholesky decomposition */
  POTF2("U", (int *)&m, W, (int *)&m, (int *)&info);
  /* error treatment */
  if(info!=0){
-		if(info<0){
-      fprintf(stderr, "LAPACK error: illegal value for argument %d of dpotf2 in %s\n", -info, LCAT(LEVMAR_DER, "()"));
-		  exit(1);
-		}
-		else{
-			fprintf(stderr, "LAPACK error: the leading minor of order %d is not positive definite,\n%s()\n", info,
-						RCAT("and the cholesky factorization could not be completed in ", LEVMAR_CHOLESKY));
-			return LM_ERROR;
-		}
+                if(info<0){
+      fprintf(stderr, "LAPACK error: illegal value for argument %d of dpotf2 in %s\n", -info, LCAT(LEVMAR_CHOLESKY, "()"));
+                }
+                else{
+                        fprintf(stderr, "LAPACK error: the leading minor of order %d is not positive definite,\n%s()\n", info,
+                                                RCAT("and the Cholesky factorization could not be completed in ", LEVMAR_CHOLESKY));
+                }
+    return LM_ERROR;
  }

  /* the decomposition is in the upper part of W (in column-major order!).
@@ -699,17 +742,17 @@ register LM_REAL sum0=0.0, sum1=0.0, sum2=0.0, sum3=0.0;
   */ 
  blockn = (n>>bpwr)<<bpwr; /* (n / blocksize) * blocksize; */

+  /* unroll the loop in blocks of `blocksize'; looping downwards gains some more speed */
  if(x){
-    /* unroll the loop in blocks of `blocksize' */
-    for(i=0; i<blockn; i+=blocksize){
+    for(i=blockn-1; i>0; i-=blocksize){
              e[i ]=x[i ]-y[i ]; sum0+=e[i ]*e[i ];
-      j1=i+1; e[j1]=x[j1]-y[j1]; sum1+=e[j1]*e[j1];
-      j2=i+2; e[j2]=x[j2]-y[j2]; sum2+=e[j2]*e[j2];
-      j3=i+3; e[j3]=x[j3]-y[j3]; sum3+=e[j3]*e[j3];
-      j4=i+4; e[j4]=x[j4]-y[j4]; sum0+=e[j4]*e[j4];
-      j5=i+5; e[j5]=x[j5]-y[j5]; sum1+=e[j5]*e[j5];
-      j6=i+6; e[j6]=x[j6]-y[j6]; sum2+=e[j6]*e[j6];
-      j7=i+7; e[j7]=x[j7]-y[j7]; sum3+=e[j7]*e[j7];
+      j1=i-1; e[j1]=x[j1]-y[j1]; sum1+=e[j1]*e[j1];
+      j2=i-2; e[j2]=x[j2]-y[j2]; sum2+=e[j2]*e[j2];
+      j3=i-3; e[j3]=x[j3]-y[j3]; sum3+=e[j3]*e[j3];
+      j4=i-4; e[j4]=x[j4]-y[j4]; sum0+=e[j4]*e[j4];
+      j5=i-5; e[j5]=x[j5]-y[j5]; sum1+=e[j5]*e[j5];
+      j6=i-6; e[j6]=x[j6]-y[j6]; sum2+=e[j6]*e[j6];
+      j7=i-7; e[j7]=x[j7]-y[j7]; sum3+=e[j7]*e[j7];
    }

   /*
@@ -718,6 +761,7 @@ register LM_REAL sum0=0.0, sum1=0.0, sum2=0.0, sum3=0.0;
    * but a switch is faster (and more interesting) 
    */ 

+    i=blockn;
    if(i<n){ 
      /* Jump into the case at the place that will allow
       * us to finish off the appropriate number of items. 
@@ -735,16 +779,15 @@ register LM_REAL sum0=0.0, sum1=0.0, sum2=0.0, sum3=0.0;
    }
  }
  else{ /* x==0 */
-    /* unroll the loop in blocks of `blocksize' */
-    for(i=0; i<blockn; i+=blocksize){
+    for(i=blockn-1; i>0; i-=blocksize){
              e[i ]=-y[i ]; sum0+=e[i ]*e[i ];
-      j1=i+1; e[j1]=-y[j1]; sum1+=e[j1]*e[j1];
-      j2=i+2; e[j2]=-y[j2]; sum2+=e[j2]*e[j2];
-      j3=i+3; e[j3]=-y[j3]; sum3+=e[j3]*e[j3];
-      j4=i+4; e[j4]=-y[j4]; sum0+=e[j4]*e[j4];
-      j5=i+5; e[j5]=-y[j5]; sum1+=e[j5]*e[j5];
-      j6=i+6; e[j6]=-y[j6]; sum2+=e[j6]*e[j6];
-      j7=i+7; e[j7]=-y[j7]; sum3+=e[j7]*e[j7];
+      j1=i-1; e[j1]=-y[j1]; sum1+=e[j1]*e[j1];
+      j2=i-2; e[j2]=-y[j2]; sum2+=e[j2]*e[j2];
+      j3=i-3; e[j3]=-y[j3]; sum3+=e[j3]*e[j3];
+      j4=i-4; e[j4]=-y[j4]; sum0+=e[j4]*e[j4];
+      j5=i-5; e[j5]=-y[j5]; sum1+=e[j5]*e[j5];
+      j6=i-6; e[j6]=-y[j6]; sum2+=e[j6]*e[j6];
+      j7=i-7; e[j7]=-y[j7]; sum3+=e[j7]*e[j7];
    }

   /*
@@ -753,6 +796,7 @@ register LM_REAL sum0=0.0, sum1=0.0, sum2=0.0, sum3=0.0;
    * but a switch is faster (and more interesting) 
    */ 

+    i=blockn;
    if(i<n){ 
      /* Jump into the case at the place that will allow
       * us to finish off the appropriate number of items. 
@@ -794,7 +838,7 @@ static int SVDINV(LM_REAL *a, LM_REAL *b, int m)
                                  (ct=bt/at,at*sqrt(1.0+ct*ct)) \
                                : (bt ? (ct=at/bt,bt*sqrt(1.0+ct*ct)): 0.0))
 #define SIGN(a,b) ((b) >= 0.0 ? fabs(a) : -fabs(a))
-#define TOL             1.0e-11
+#define TOL             1.0e-6

   int                  flag,i,its,j,jj,k,l,mmi,nm, nml, rank;
   LM_REAL              *vmat,*wmat,
@@ -804,7 +848,6 @@ static int SVDINV(LM_REAL *a, LM_REAL *b, int m)
                        anorm, g, scale,
                        at,bt,ct,maxarg1,maxarg2,
                        thresh, wmax;
-
  anorm = g = scale = 0.0;
  
  rv1=(LM_REAL *)malloc(m*sizeof(LM_REAL));
@@ -953,7 +996,7 @@ static int SVDINV(LM_REAL *a, LM_REAL *b, int m)
        for (l=k;l>=0;l--)
          {
          nm=l-1;
-          if (fabs(rv1[l]) <= anorm*TOL)
+          if (!l || fabs(rv1[l]) <= anorm*TOL)
            {
            flag=0;
            break;
@@ -1103,6 +1146,7 @@ static int SVDINV(LM_REAL *a, LM_REAL *b, int m)
 #undef LEVMAR_COVAR
 #undef LEVMAR_STDDEV
 #undef LEVMAR_CORCOEF
+#undef LEVMAR_R2
 #undef LEVMAR_CHKJAC
 #undef LEVMAR_FDIF_FORW_JAC_APPROX
 #undef LEVMAR_FDIF_CENT_JAC_APPROX