Commit 534cebef authored by Emmanuel Bertin's avatar Emmanuel Bertin
Browse files

Fixed issue with SVD on some objects.

Updated the LevMar library to V2.4.
Pushed version number to 2.9.7.
parent 882ad557
#! /bin/sh #! /bin/sh
# Guess values for system-dependent variables and create Makefiles. # Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.63 for sextractor 2.9.6. # Generated by GNU Autoconf 2.63 for sextractor 2.9.7.
# #
# Report bugs to <bertin@iap.fr>. # Report bugs to <bertin@iap.fr>.
# #
...@@ -750,8 +750,8 @@ SHELL=${CONFIG_SHELL-/bin/sh} ...@@ -750,8 +750,8 @@ SHELL=${CONFIG_SHELL-/bin/sh}
# Identity of this package. # Identity of this package.
PACKAGE_NAME='sextractor' PACKAGE_NAME='sextractor'
PACKAGE_TARNAME='sextractor' PACKAGE_TARNAME='sextractor'
PACKAGE_VERSION='2.9.6' PACKAGE_VERSION='2.9.7'
PACKAGE_STRING='sextractor 2.9.6' PACKAGE_STRING='sextractor 2.9.7'
PACKAGE_BUGREPORT='bertin@iap.fr' PACKAGE_BUGREPORT='bertin@iap.fr'
   
ac_unique_file="src/makeit.c" ac_unique_file="src/makeit.c"
...@@ -1505,7 +1505,7 @@ if test "$ac_init_help" = "long"; then ...@@ -1505,7 +1505,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing. # Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh. # This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF cat <<_ACEOF
\`configure' configures sextractor 2.9.6 to adapt to many kinds of systems. \`configure' configures sextractor 2.9.7 to adapt to many kinds of systems.
   
Usage: $0 [OPTION]... [VAR=VALUE]... Usage: $0 [OPTION]... [VAR=VALUE]...
   
...@@ -1575,7 +1575,7 @@ fi ...@@ -1575,7 +1575,7 @@ fi
   
if test -n "$ac_init_help"; then if test -n "$ac_init_help"; then
case $ac_init_help in case $ac_init_help in
short | recursive ) echo "Configuration of sextractor 2.9.6:";; short | recursive ) echo "Configuration of sextractor 2.9.7:";;
esac esac
cat <<\_ACEOF cat <<\_ACEOF
   
...@@ -1706,7 +1706,7 @@ fi ...@@ -1706,7 +1706,7 @@ fi
test -n "$ac_init_help" && exit $ac_status test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then if $ac_init_version; then
cat <<\_ACEOF cat <<\_ACEOF
sextractor configure 2.9.6 sextractor configure 2.9.7
generated by GNU Autoconf 2.63 generated by GNU Autoconf 2.63
   
Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001, Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
...@@ -1720,7 +1720,7 @@ cat >config.log <<_ACEOF ...@@ -1720,7 +1720,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake. running configure, to aid debugging if configure makes a mistake.
   
It was created by sextractor $as_me 2.9.6, which was It was created by sextractor $as_me 2.9.7, which was
generated by GNU Autoconf 2.63. Invocation command line was generated by GNU Autoconf 2.63. Invocation command line was
   
$ $0 $@ $ $0 $@
...@@ -2423,7 +2423,7 @@ fi ...@@ -2423,7 +2423,7 @@ fi
   
# Define the identity of the package. # Define the identity of the package.
PACKAGE='sextractor' PACKAGE='sextractor'
VERSION='2.9.6' VERSION='2.9.7'
   
   
cat >>confdefs.h <<_ACEOF cat >>confdefs.h <<_ACEOF
...@@ -28291,7 +28291,7 @@ exec 6>&1 ...@@ -28291,7 +28291,7 @@ exec 6>&1
# report actual input values of CONFIG_FILES etc. instead of their # report actual input values of CONFIG_FILES etc. instead of their
# values after options handling. # values after options handling.
ac_log=" ac_log="
This file was extended by sextractor $as_me 2.9.6, which was This file was extended by sextractor $as_me 2.9.7, which was
generated by GNU Autoconf 2.63. Invocation command line was generated by GNU Autoconf 2.63. Invocation command line was
   
CONFIG_FILES = $CONFIG_FILES CONFIG_FILES = $CONFIG_FILES
...@@ -28354,7 +28354,7 @@ Report bugs to <bug-autoconf@gnu.org>." ...@@ -28354,7 +28354,7 @@ Report bugs to <bug-autoconf@gnu.org>."
_ACEOF _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_version="\\ ac_cs_version="\\
sextractor config.status 2.9.6 sextractor config.status 2.9.7
configured by $0, generated by GNU Autoconf 2.63, configured by $0, generated by GNU Autoconf 2.63,
with options \\"`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`\\" with options \\"`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`\\"
   
......
...@@ -6,7 +6,7 @@ define([AC_CACHE_LOAD],) ...@@ -6,7 +6,7 @@ define([AC_CACHE_LOAD],)
define([AC_CACHE_SAVE],) define([AC_CACHE_SAVE],)
# This is your standard Bertin source code... # This is your standard Bertin source code...
AC_INIT(sextractor, 2.9.6, [bertin@iap.fr]) AC_INIT(sextractor, 2.9.7, [bertin@iap.fr])
AC_CONFIG_SRCDIR(src/makeit.c) AC_CONFIG_SRCDIR(src/makeit.c)
AC_CONFIG_AUX_DIR(autoconf) AC_CONFIG_AUX_DIR(autoconf)
AM_CONFIG_HEADER(config.h) AM_CONFIG_HEADER(config.h)
......
.TH SEXTRACTOR "1" "October 2009" "SWarp 2.9.6" "User Commands" .TH SEXTRACTOR "1" "October 2009" "SWarp 2.9.7" "User Commands"
.SH NAME .SH NAME
sex \- extract a source catalog from an astronomical FITS image sex \- extract a source catalog from an astronomical FITS image
.SH SYNOPSIS .SH SYNOPSIS
......
...@@ -36,15 +36,17 @@ ...@@ -36,15 +36,17 @@
/* prototypes of LAPACK routines */ /* prototypes of LAPACK routines */
#define GEQRF LM_ADD_PREFIX(geqrf_)
#define ORGQR LM_ADD_PREFIX(orgqr_) #define GEQRF LM_MK_LAPACK_NAME(geqrf)
#define TRTRS LM_ADD_PREFIX(trtrs_) #define ORGQR LM_MK_LAPACK_NAME(orgqr)
#define POTF2 LM_ADD_PREFIX(potf2_) #define TRTRS LM_MK_LAPACK_NAME(trtrs)
#define POTRF LM_ADD_PREFIX(potrf_) #define POTF2 LM_MK_LAPACK_NAME(potf2)
#define GETRF LM_ADD_PREFIX(getrf_) #define POTRF LM_MK_LAPACK_NAME(potrf)
#define GETRS LM_ADD_PREFIX(getrs_) #define POTRS LM_MK_LAPACK_NAME(potrs)
#define GESVD LM_ADD_PREFIX(gesvd_) #define GETRF LM_MK_LAPACK_NAME(getrf)
#define GESDD LM_ADD_PREFIX(gesdd_) #define GETRS LM_MK_LAPACK_NAME(getrs)
#define GESVD LM_MK_LAPACK_NAME(gesvd)
#define GESDD LM_MK_LAPACK_NAME(gesdd)
/* QR decomposition */ /* QR decomposition */
extern int GEQRF(int *m, int *n, LM_REAL *a, int *lda, LM_REAL *tau, LM_REAL *work, int *lwork, int *info); extern int GEQRF(int *m, int *n, LM_REAL *a, int *lda, LM_REAL *tau, LM_REAL *work, int *lwork, int *info);
...@@ -53,9 +55,10 @@ extern int ORGQR(int *m, int *n, int *k, LM_REAL *a, int *lda, LM_REAL *tau, LM_ ...@@ -53,9 +55,10 @@ extern int ORGQR(int *m, int *n, int *k, LM_REAL *a, int *lda, LM_REAL *tau, LM_
/* solution of triangular systems */ /* solution of triangular systems */
extern int TRTRS(char *uplo, char *trans, char *diag, int *n, int *nrhs, LM_REAL *a, int *lda, LM_REAL *b, int *ldb, int *info); extern int TRTRS(char *uplo, char *trans, char *diag, int *n, int *nrhs, LM_REAL *a, int *lda, LM_REAL *b, int *ldb, int *info);
/* cholesky decomposition */ /* Cholesky decomposition and systems solution */
extern int POTF2(char *uplo, int *n, LM_REAL *a, int *lda, int *info); extern int POTF2(char *uplo, int *n, LM_REAL *a, int *lda, int *info);
extern int POTRF(char *uplo, int *n, LM_REAL *a, int *lda, int *info); /* block version of dpotf2 */ extern int POTRF(char *uplo, int *n, LM_REAL *a, int *lda, int *info); /* block version of dpotf2 */
extern int POTRS(char *uplo, int *n, int *nrhs, LM_REAL *a, int *lda, LM_REAL *b, int *ldb, int *info);
/* LU decomposition and systems solution */ /* LU decomposition and systems solution */
extern int GETRF(int *m, int *n, LM_REAL *a, int *lda, int *ipiv, int *info); extern int GETRF(int *m, int *n, LM_REAL *a, int *lda, int *ipiv, int *info);
...@@ -88,7 +91,7 @@ extern int GESDD(char *jobz, int *m, int *n, LM_REAL *a, int *lda, LM_REAL *s, L ...@@ -88,7 +91,7 @@ extern int GESDD(char *jobz, int *m, int *n, LM_REAL *a, int *lda, LM_REAL *s, L
* *
* A is mxm, b is mx1 * A is mxm, b is mx1
* *
* The function returns 0 in case of error, 1 if successfull * The function returns 0 in case of error, 1 if successful
* *
* This function is often called repetitively to solve problems of identical * This function is often called repetitively to solve problems of identical
* dimensions. To avoid repetitive malloc's and free's, allocated memory is * dimensions. To avoid repetitive malloc's and free's, allocated memory is
...@@ -100,6 +103,8 @@ int AX_EQ_B_QR(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m) ...@@ -100,6 +103,8 @@ int AX_EQ_B_QR(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m)
__STATIC__ LM_REAL *buf=NULL; __STATIC__ LM_REAL *buf=NULL;
__STATIC__ int buf_sz=0; __STATIC__ int buf_sz=0;
static int nb=0; /* no __STATIC__ decl. here! */
LM_REAL *a, *qtb, *tau, *r, *work; LM_REAL *a, *qtb, *tau, *r, *work;
int a_sz, qtb_sz, tau_sz, r_sz, tot_sz; int a_sz, qtb_sz, tau_sz, r_sz, tot_sz;
register int i, j; register int i, j;
...@@ -123,7 +128,15 @@ register LM_REAL sum; ...@@ -123,7 +128,15 @@ register LM_REAL sum;
qtb_sz=m; qtb_sz=m;
tau_sz=m; tau_sz=m;
r_sz=m*m; /* only the upper triangular part really needed */ r_sz=m*m; /* only the upper triangular part really needed */
worksz=3*m; /* this is probably too much */
if(!nb){
LM_REAL tmp;
worksz=-1; // workspace query; optimal size is returned
GEQRF((int *)&m, (int *)&m, NULL, (int *)&m, NULL, (LM_
nb=((int)tmp)/m; // optimal worksize is m*nb
}
worksz=nb*m;
tot_sz=a_sz + qtb_sz + tau_sz + r_sz + worksz; tot_sz=a_sz + qtb_sz + tau_sz + r_sz + worksz;
#ifdef LINSOLVERS_RETAIN_MEMORY #ifdef LINSOLVERS_RETAIN_MEMORY
...@@ -244,7 +257,7 @@ register LM_REAL sum; ...@@ -244,7 +257,7 @@ register LM_REAL sum;
* *
* A is mxn, b is mx1 * A is mxn, b is mx1
* *
* The function returns 0 in case of error, 1 if successfull * The function returns 0 in case of error, 1 if successful
* *
* This function is often called repetitively to solve problems of identical * This function is often called repetitively to solve problems of identical
* dimensions. To avoid repetitive malloc's and free's, allocated memory is * dimensions. To avoid repetitive malloc's and free's, allocated memory is
...@@ -256,6 +269,8 @@ int AX_EQ_B_QRLS(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m, int n) ...@@ -256,6 +269,8 @@ int AX_EQ_B_QRLS(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m, int n)
__STATIC__ LM_REAL *buf=NULL; __STATIC__ LM_REAL *buf=NULL;
__STATIC__ int buf_sz=0; __STATIC__ int buf_sz=0;
static int nb=0; /* no __STATIC__ decl. here! */
LM_REAL *a, *atb, *tau, *r, *work; LM_REAL *a, *atb, *tau, *r, *work;
int a_sz, atb_sz, tau_sz, r_sz, tot_sz; int a_sz, atb_sz, tau_sz, r_sz, tot_sz;
register int i, j; register int i, j;
...@@ -284,7 +299,14 @@ register LM_REAL sum; ...@@ -284,7 +299,14 @@ register LM_REAL sum;
atb_sz=n; atb_sz=n;
tau_sz=n; tau_sz=n;
r_sz=n*n; r_sz=n*n;
worksz=3*n; /* this is probably too much */ if(!nb){
LM_REAL tmp;
worksz=-1; // workspace query; optimal size is returned
GEQRF((int *)&m, (int *)&m, NULL, (int *)&m, NULL, (LM_
nb=((int)tmp)/m; // optimal worksize is m*nb
}
worksz=nb*m;
tot_sz=a_sz + atb_sz + tau_sz + r_sz + worksz; tot_sz=a_sz + atb_sz + tau_sz + r_sz + worksz;
#ifdef LINSOLVERS_RETAIN_MEMORY #ifdef LINSOLVERS_RETAIN_MEMORY
...@@ -411,7 +433,7 @@ register LM_REAL sum; ...@@ -411,7 +433,7 @@ register LM_REAL sum;
* *
* A is mxm, b is mx1 * A is mxm, b is mx1
* *
* The function returns 0 in case of error, 1 if successfull * The function returns 0 in case of error, 1 if successful
* *
* This function is often called repetitively to solve problems of identical * This function is often called repetitively to solve problems of identical
* dimensions. To avoid repetitive malloc's and free's, allocated memory is * dimensions. To avoid repetitive malloc's and free's, allocated memory is
...@@ -425,7 +447,7 @@ __STATIC__ int buf_sz=0; ...@@ -425,7 +447,7 @@ __STATIC__ int buf_sz=0;
LM_REAL *a, *b; LM_REAL *a, *b;
int a_sz, b_sz, tot_sz; int a_sz, b_sz, tot_sz;
register int i, j; register int i;
int info, nrhs=1; int info, nrhs=1;
if(!A) if(!A)
...@@ -468,24 +490,29 @@ int info, nrhs=1; ...@@ -468,24 +490,29 @@ int info, nrhs=1;
a=buf; a=buf;
b=a+a_sz; b=a+a_sz;
/* store A (column major!) into a anb B into b */ /* store A into a anb B into b. A is assumed symmetric,
for(i=0; i<m; i++){ * hence no transposition is needed
for(j=0; j<m; j++) */
a[i+j*m]=A[i*m+j]; for(i=0; i<m; i++){
a[i]=A[i];
b[i]=B[i]; b[i]=B[i];
} }
for(i=m; i<m*m; i++)
a[i]=A[i];
/* Cholesky decomposition of A */ /* Cholesky decomposition of A */
POTF2("U", (int *)&m, a, (int *)&m, (int *)&info); //POTF2("U", (int *)&m, a, (int *)&m, (int *)&info);
POTRF("U", (int *)&m, a, (int *)&m, (int *)&info);
/* error treatment */ /* error treatment */
if(info!=0){ if(info!=0){
if(info<0){ if(info<0){
fprintf(stderr, RCAT(RCAT("LAPACK error: illegal value for argument %d of ", POTF2) " in ", AX_EQ_B_CHOL) "()\n", -info); fprintf(stderr, RCAT(RCAT(RCAT("LAPACK error: illegal value for argument %d of ", POTF2) "/", POTRF) " in ",
AX_EQ_B_CHOL) "()\n", -info);
exit(1); exit(1);
} }
else{ else{
fprintf(stderr, RCAT(RCAT("LAPACK error: the leading minor of order %d is not positive definite,\nthe factorization could not be completed for ", POTF2) " in ", AX_EQ_B_CHOL) "()\n", info); fprintf(stderr, RCAT(RCAT(RCAT("LAPACK error: the leading minor of order %d is not positive definite,\nthe factorization could not be completed for ", POTF2) "/", POTRF), " in ",
AX_EQ_B_CHOL) "()\n", info);
#ifndef LINSOLVERS_RETAIN_MEMORY #ifndef LINSOLVERS_RETAIN_MEMORY
free(buf); free(buf);
#endif #endif
...@@ -494,7 +521,15 @@ int info, nrhs=1; ...@@ -494,7 +521,15 @@ int info, nrhs=1;
} }
} }
/* solve the linear system U^T y = b */ /* solve using the computed Cholesky in one lapack call */
POTRS("U", (int *)&m, (int *)&nrhs, a, (int *)&m, b, (int *)&m, &info);
if(info<0){
fprintf(stderr, RCAT(RCAT("LAPACK error: illegal value for argument %d of ", POTRS) " in ", AX_EQ_B_CHOL) "()\n", -info);
exit(1);
}
#if 0
/* alternative: solve the linear system U^T y = b ... */
TRTRS("U", "T", "N", (int *)&m, (int *)&nrhs, a, (int *)&m, b, (int *)&m, &info); TRTRS("U", "T", "N", (int *)&m, (int *)&nrhs, a, (int *)&m, b, (int *)&m, &info);
/* error treatment */ /* error treatment */
if(info!=0){ if(info!=0){
...@@ -512,7 +547,7 @@ int info, nrhs=1; ...@@ -512,7 +547,7 @@ int info, nrhs=1;
} }
} }
/* solve the linear system U x = y */ /* ... solve the linear system U x = y */
TRTRS("U", "N", "N", (int *)&m, (int *)&nrhs, a, (int *)&m, b, (int *)&m, &info); TRTRS("U", "N", "N", (int *)&m, (int *)&nrhs, a, (int *)&m, b, (int *)&m, &info);
/* error treatment */ /* error treatment */
if(info!=0){ if(info!=0){
...@@ -529,6 +564,7 @@ int info, nrhs=1; ...@@ -529,6 +564,7 @@ int info, nrhs=1;
return 0; return 0;
} }
} }
#endif /* 0 */
/* copy the result in x */ /* copy the result in x */
for(i=0; i<m; i++) for(i=0; i<m; i++)
...@@ -551,8 +587,7 @@ int info, nrhs=1; ...@@ -551,8 +587,7 @@ int info, nrhs=1;
* *
* A is mxm, b is mx1 * A is mxm, b is mx1
* *
* The function returns 0 in case of error, * The function returns 0 in case of error, 1 if successful
* 1 if successfull
* *
* This function is often called repetitively to solve problems of identical * This function is often called repetitively to solve problems of identical
* dimensions. To avoid repetitive malloc's and free's, allocated memory is * dimensions. To avoid repetitive malloc's and free's, allocated memory is
...@@ -564,10 +599,10 @@ int AX_EQ_B_LU(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m) ...@@ -564,10 +599,10 @@ int AX_EQ_B_LU(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m)
__STATIC__ LM_REAL *buf=NULL; __STATIC__ LM_REAL *buf=NULL;
__STATIC__ int buf_sz=0; __STATIC__ int buf_sz=0;
int a_sz, ipiv_sz, b_sz, work_sz, tot_sz; int a_sz, ipiv_sz, b_sz, tot_sz;
register int i, j; register int i, j;
int info, *ipiv, nrhs=1; int info, *ipiv, nrhs=1;
LM_REAL *a, *b, *work; LM_REAL *a, *b;
if(!A) if(!A)
#ifdef LINSOLVERS_RETAIN_MEMORY #ifdef LINSOLVERS_RETAIN_MEMORY
...@@ -585,15 +620,14 @@ LM_REAL *a, *b, *work; ...@@ -585,15 +620,14 @@ LM_REAL *a, *b, *work;
ipiv_sz=m; ipiv_sz=m;
a_sz=m*m; a_sz=m*m;
b_sz=m; b_sz=m;
work_sz=100*m; /* this is probably too much */ tot_sz=(a_sz + b_sz)*sizeof(LM_REAL) + ipiv_sz*sizeof(int); /* should be arranged in that order for proper doubles alignment */
tot_sz=ipiv_sz + a_sz + b_sz + work_sz; // ipiv_sz counted as LM_REAL here, no harm is done though
#ifdef LINSOLVERS_RETAIN_MEMORY #ifdef LINSOLVERS_RETAIN_MEMORY
if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */ if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */
if(buf) free(buf); /* free previously allocated memory */ if(buf) free(buf); /* free previously allocated memory */
buf_sz=tot_sz; buf_sz=tot_sz;
buf=(LM_REAL *)malloc(buf_sz*sizeof(LM_REAL)); buf=(LM_REAL *)malloc(buf_sz);
if(!buf){ if(!buf){
fprintf(stderr, RCAT("memory allocation in ", AX_EQ_B_LU) "() failed!\n"); fprintf(stderr, RCAT("memory allocation in ", AX_EQ_B_LU) "() failed!\n");
exit(1); exit(1);
...@@ -601,17 +635,16 @@ LM_REAL *a, *b, *work; ...@@ -601,17 +635,16 @@ LM_REAL *a, *b, *work;
} }
#else #else
buf_sz=tot_sz; buf_sz=tot_sz;
buf=(LM_REAL *)malloc(buf_sz*sizeof(LM_REAL)); buf=(LM_REAL *)malloc(buf_sz);
if(!buf){ if(!buf){
fprintf(stderr, RCAT("memory allocation in ", AX_EQ_B_LU) "() failed!\n"); fprintf(stderr, RCAT("memory allocation in ", AX_EQ_B_LU) "() failed!\n");
exit(1); exit(1);
} }
#endif /* LINSOLVERS_RETAIN_MEMORY */ #endif /* LINSOLVERS_RETAIN_MEMORY */
ipiv=(int *)buf; a=buf;
a=(LM_REAL *)(ipiv + ipiv_sz);
b=a+a_sz; b=a+a_sz;
work=b+b_sz; ipiv=(int *)(b+b_sz);
/* store A (column major!) into a and B into b */ /* store A (column major!) into a and B into b */
for(i=0; i<m; i++){ for(i=0; i<m; i++){
...@@ -677,7 +710,7 @@ LM_REAL *a, *b, *work; ...@@ -677,7 +710,7 @@ LM_REAL *a, *b, *work;
* *
* A is mxm, b is mx1. * A is mxm, b is mx1.
* *
* The function returns 0 in case of error, 1 if successfull * The function returns 0 in case of error, 1 if successful
* *
* This function is often called repetitively to solve problems of identical * This function is often called repetitively to solve problems of identical
* dimensions. To avoid repetitive malloc's and free's, allocated memory is * dimensions. To avoid repetitive malloc's and free's, allocated memory is
...@@ -710,12 +743,21 @@ int info, rank, worksz, *iwork, iworksz; ...@@ -710,12 +743,21 @@ int info, rank, worksz, *iwork, iworksz;
#endif /* LINSOLVERS_RETAIN_MEMORY */ #endif /* LINSOLVERS_RETAIN_MEMORY */
/* calculate required memory size */ /* calculate required memory size */
worksz=16*m; /* more than needed */ #if 1 /* use optimal size */
worksz=-1; // workspace query. Keep in mind that GESDD requires more memory than GESVD
/* note that optimal work size is returned in thresh */
GESVD("A", "A", (int *)&m, (int *)&m, NULL, (int *)&m, NULL, NULL, (int *)&m, NULL, (int *)&m, (LM_REAL *)&thresh, (int *)&worksz, &info);
//GESDD("A", (int *)&m, (int *)&m, NULL, (int *)&m, NULL, NULL, (int *)&m, NULL, (int *)&m, (LM_REAL *)&thresh, (int *)&worksz, NULL, &info);
worksz=(int)thresh;
#else /* use minimum size */
worksz=5*m; // min worksize for GESVD
//worksz=m*(7*m+4); // min worksize for GESDD
#endif
iworksz=8*m; iworksz=8*m;
a_sz=m*m; a_sz=m*m;
u_sz=m*m; s_sz=m; vt_sz=m*m; u_sz=m*m; s_sz=m; vt_sz=m*m;
tot_sz=iworksz*sizeof(int) + (a_sz + u_sz + s_sz + vt_sz + worksz)*sizeof(LM_REAL); tot_sz=(a_sz + u_sz + s_sz + vt_sz + worksz)*sizeof(LM_REAL) + iworksz*sizeof(int); /* should be arranged in that order for proper doubles alignment */
#ifdef LINSOLVERS_RETAIN_MEMORY #ifdef LINSOLVERS_RETAIN_MEMORY
if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */ if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */
...@@ -737,18 +779,18 @@ int info, rank, worksz, *iwork, iworksz; ...@@ -737,18 +779,18 @@ int info, rank, worksz, *iwork, iworksz;
} }
#endif /* LINSOLVERS_RETAIN_MEMORY */ #endif /* LINSOLVERS_RETAIN_MEMORY */
iwork=(int *)buf; a=buf;
a=(LM_REAL *)(iwork+iworksz); u=a+a_sz;
s=u+u_sz;
vt=s+s_sz;
work=vt+vt_sz;
iwork=(int *)(work+worksz);
/* store A (column major!) into a */ /* store A (column major!) into a */
for(i=0; i<m; i++) for(i=0; i<m; i++)
for(j=0; j<m; j++) for(j=0; j<m; j++)
a[i+j*m]=A[i*m+j]; a[i+j*m]=A[i*m+j];
u=a + a_sz;
s=u+u_sz;
vt=s+s_sz;
work=vt+vt_sz;
/* SVD decomposition of A */ /* SVD decomposition of A */
GESVD("A", "A", (int *)&m, (int *)&m, a, (int *)&m, s, u, (int *)&m, vt, (int *)&m, work, (int *)&worksz, &info); GESVD("A", "A", (int *)&m, (int *)&m, a, (int *)&m, s, u, (int *)&m, vt, (int *)&m, work, (int *)&worksz, &info);
//GESDD("A", (int *)&m, (int *)&m, a, (int *)&m, s, u, (int *)&m, vt, (int *)&m, work, (int *)&worksz, iwork, &info); //GESDD("A", (int *)&m, (int *)&m, a, (int *)&m, s, u, (int *)&m, vt, (int *)&m, work, (int *)&worksz, iwork, &info);
...@@ -845,8 +887,7 @@ extern int GETRS(char *trans, int *n, int *nrhs, LM_REAL *a, int *lda, int *ipiv ...@@ -845,8 +887,7 @@ extern int GETRS(char *trans, int *n, int *nrhs, LM_REAL *a, int *lda, int *ipiv
* *
* A is mxm, b is mx1 * A is mxm, b is mx1
* *
* The function returns 0 in case of error, * The function returns 0 in case of error, 1 if successful
* 1 if successfull
* *
* This function is often called repetitively to solve problems of identical * This function is often called repetitively to solve problems of identical
* dimensions. To avoid repetitive malloc's and free's, allocated memory is * dimensions. To avoid repetitive malloc's and free's, allocated memory is
...@@ -862,7 +903,7 @@ __STATIC__ int buf_sz=0; ...@@ -862,7 +903,7 @@ __STATIC__ int buf_sz=0;
int a_sz, ipiv_sz, b_sz, work_sz, tot_sz; int a_sz, ipiv_sz, b_sz, work_sz, tot_sz;
register int i, j; register int i, j;
int info, *ipiv; int info, *ipiv;
LM_REAL *a, *b, *work; LM_REAL *a, *b;
if(!A) if(!A)
#ifdef LINSOLVERS_RETAIN_MEMORY #ifdef LINSOLVERS_RETAIN_MEMORY
{ {
...@@ -879,15 +920,14 @@ LM_REAL *a, *b, *work; ...@@ -879,15 +920,14 @@ LM_REAL *a, *b, *work;
ipiv_sz=m; ipiv_sz=m;
a_sz=m*m; a_sz=m*m;
b_sz=m; b_sz=m;
work_sz=100*m; /* this is probably too much */ tot_sz=(ipiv_sz + a_sz + b_sz)*sizeof(LM_REAL);
tot_sz=ipiv_sz + a_sz + b_sz + work_sz; // ipiv_sz counted as LM_REAL here, no harm is done though
#ifdef LINSOLVERS_RETAIN_MEMORY #ifdef LINSOLVERS_RETAIN_MEMORY
if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */ if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */
if(buf) free(buf); /* free previously allocated memory */ if(buf) free(buf); /* free previously allocated memory */
buf_sz=tot_sz; buf_sz=tot_sz;
buf=(LM_REAL *)malloc(buf_sz*sizeof(LM_REAL)); buf=(void *)malloc(buf_sz);
if(!buf){ if(!buf){
fprintf(stderr, RCAT("memory allocation in ", AX_EQ_B_LU) "() failed!\n"); fprintf(stderr, RCAT("memory allocation in ", AX_EQ_B_LU) "() failed!\n");
exit(1); exit(1);
...@@ -895,7 +935,7 @@ LM_REAL *a, *b, *work; ...@@ -895,7 +935,7 @@ LM_REAL *a, *b, *work;
} }
#else #else
buf_sz=tot_sz; buf_sz=tot_sz;
buf=(LM_REAL *)malloc(buf_sz*sizeof(LM_REAL)); buf=(void *))malloc(buf_sz);
if(!buf){ if(!buf){
fprintf(stderr, RCAT("memory allocation in ", AX_EQ_B_LU) "() failed!\n"); fprintf(stderr, RCAT("memory allocation in ", AX_EQ_B_LU) "() failed!\n");
exit(1); exit(1);
...@@ -905,7 +945,6 @@ LM_REAL *a, *b, *work; ...@@ -905,7 +945,6 @@ LM_REAL *a, *b, *work;
ipiv=(int *)buf; ipiv=(int *)buf;
a=(LM_REAL *)(ipiv + ipiv_sz); a=(LM_REAL *)(ipiv + ipiv_sz);
b=a+a_sz; b=a+a_sz;
work=b+b_sz;
/* store A (column major!) into a and B into b */ /* store A (column major!) into a and B into b */
for(i=0; i<m; i++){ for(i=0; i<m; i++){
......
The levmar v2.3 library has been included in this package untouched, except for The levmar v2.4 library has been included in this package untouched, except for
three warnings removed, LU decomposition replaced with calls to ATLAS-Lapack three warnings removed, LU decomposition replaced with calls to ATLAS-Lapack
routines, Hessian matrix inversion done with SVD, and an AutoMakefile added. routines, Hessian matrix inversion done with SVD, and an AutoMakefile added.
Emmanuel Bertin <bertin@iap.fr> Emmanuel Bertin <bertin@iap.fr>
************************************************************** **************************************************************
LEVMAR LEVMAR
version 2.3 version 2.4
By Manolis Lourakis By Manolis Lourakis
Institute of Computer Science Institute of Computer Science
...@@ -28,7 +28,7 @@ It is strongly recommended that you *do* employ LAPACK; if you don't have it alr ...@@ -28,7 +28,7 @@ It is strongly recommended that you *do* employ LAPACK; if you don't have it alr
I suggest getting clapack from http://www.netlib.org/clapack. However, LAPACK's I suggest getting clapack from http://www.netlib.org/clapack. However, LAPACK's
use is not mandatory and the 2nd option makes levmar totally self-contained. use is not mandatory and the 2nd option makes levmar totally self-contained.
See lmdemo.c for examples of use and http://www.ics.forth.gr/~lourakis/levmar See lmdemo.c for examples of use and http://www.ics.forth.gr/~lourakis/levmar
for general comments. for general comments. An example of using levmar for data fitting is in expfit.c
The mathematical theory behind levmar is described in the lecture notes entitled The mathematical theory behind levmar is described in the lecture notes entitled
"Methods for Non-Linear Least Squares Problems", by K. Madsen, H.B. Nielsen and O. Tingleff, "Methods for Non-Linear Least Squares Problems", by K. Madsen, H.B. Nielsen and O. Tingleff,
...@@ -52,6 +52,10 @@ COMPILATION ...@@ -52,6 +52,10 @@ COMPILATION
- Under Windows and if Visual C is installed & configured for command line - Under Windows and if Visual C is installed & configured for command line
use, type "nmake /f Makefile.vc" in a cmd window to build levmar and the use, type "nmake /f Makefile.vc" in a cmd window to build levmar and the
demo program. In case of trouble, read the comments on top of Makefile.vc demo program. In case of trouble, read the comments on top of Makefile.vc
Visual C++ project files (levmar.vcproj and lmdemo.vcproj) are also included,
however they are not supported and are only meant to serve as a starting point
for creating your own. Check http://www.arstdesign.com/articles/prjconverter.html
if you need to convert to .dsw/.dsp (i.e., Visual C++ 6.0) project files.
- levmar can also be built under various platforms using the CMake cross-platform - levmar can also be built under various platforms using the CMake cross-platform
build system. The included CMakeLists.txt file can be used to generate makefiles build system. The included CMakeLists.txt file can be used to generate makefiles
......
...@@ -27,7 +27,7 @@ ...@@ -27,7 +27,7 @@
/* specify whether to use LAPACK or not. The first option is strongly recommended */ /* specify whether to use LAPACK or not. The first option is strongly recommended */
#define HAVE_LAPACK /* use LAPACK */ #define HAVE_LAPACK /* use LAPACK */
#undef HAVE_LAPACK /* uncomment this to force not using LAPACK */ #undef HAVE_LAPACK */ /* uncomment this to force not using LAPACK */
/* to avoid the overhead of repeated mallocs(), routines in Axb.c can be instructed to /* to avoid the overhead of repeated mallocs(), routines in Axb.c can be instructed to
* retain working memory between calls. Such a choice, however, renders these routines * retain working memory between calls. Such a choice, however, renders these routines
...@@ -79,12 +79,12 @@ extern "C" { ...@@ -79,12 +79,12 @@ extern "C" {
#define LM_BLEC_DIF_WORKSZ(npar, nmeas, nconstr) LM_LEC_DIF_WORKSZ((npar), (nmeas)+(npar), (nconstr)) #define LM_BLEC_DIF_WORKSZ(npar, nmeas, nconstr) LM_LEC_DIF_WORKSZ((npar), (nmeas)+(npar), (nconstr))
#define LM_OPTS_SZ 5 /* max(4, 5) */ #define LM_OPTS_SZ 5 /* max(4, 5) */
#define LM_INFO_SZ 9 #define LM_INFO_SZ 10
#define LM_ERROR -1 #define LM_ERROR -1
#define LM_INIT_MU 1E-03 #define LM_INIT_MU 1E-03
#define LM_STOP_THRESH 1E-17 #define LM_STOP_THRESH 1E-17
#define LM_DIFF_DELTA 1E-06 #define LM_DIFF_DELTA 1E-06
#define LM_VERSION "2.3 (May 2008)" #define LM_VERSION "2.4 (April 2009)"
#ifdef LM_DBL_PREC #ifdef LM_DBL_PREC
/* double precision LM, with & without Jacobian */ /* double precision LM, with & without Jacobian */
...@@ -241,15 +241,18 @@ extern void slevmar_chkjac( ...@@ -241,15 +241,18 @@ extern void slevmar_chkjac(
float *p, int m, int n, void *adata, float *err); float *p, int m, int n, void *adata, float *err);
#endif /* LM_SNGL_PREC */ #endif /* LM_SNGL_PREC */
/* standard deviation & Pearson's correlation coefficient for best-fit parameters */ /* standard deviation, coefficient of determination (R2) & Pearson's correlation coefficient for best-fit parameters */
#ifdef LM_DBL_PREC #ifdef LM_DBL_PREC
extern double dlevmar_stddev( double *covar, int m, int i); extern double dlevmar_stddev( double *covar, int m, int i);
extern double dlevmar_corcoef(double *covar, int m, int i, int j); extern double dlevmar_corcoef(double *covar, int m, int i, int j);
extern double dlevmar_R2(void (*func)(double *p, double *hx, int m, int n, void *adata), double *p, double *x, int m, int n, void *adata);
#endif /* LM_DBL_PREC */ #endif /* LM_DBL_PREC */
#ifdef LM_SNGL_PREC #ifdef LM_SNGL_PREC
extern float slevmar_stddev( float *covar, int m, int i); extern float slevmar_stddev( float *covar, int m, int i);
extern float slevmar_corcoef(float *covar, int m, int i, int j); extern float slevmar_corcoef(float *covar, int m, int i, int j);
extern float slevmar_R2(void (*func)(float *p, float *hx, int m, int n, void *adata), float *p, float *x, int m, int n, void *adata);
#endif /* LM_SNGL_PREC */ #endif /* LM_SNGL_PREC */
#ifdef __cplusplus #ifdef __cplusplus
......
...@@ -50,7 +50,7 @@ ...@@ -50,7 +50,7 @@
* This function requires an analytic Jacobian. In case the latter is unavailable, * This function requires an analytic Jacobian. In case the latter is unavailable,
* use LEVMAR_DIF() bellow * use LEVMAR_DIF() bellow
* *
* Returns the number of iterations (>=0) if successfull, LM_ERROR if failed * Returns the number of iterations (>=0) if successful, LM_ERROR if failed
* *
* For more details, see K. Madsen, H.B. Nielsen and O. Tingleff's lecture notes on * For more details, see K. Madsen, H.B. Nielsen and O. Tingleff's lecture notes on
* non-linear least squares at http://www.imm.dtu.dk/pubdb/views/edoc_download.php/3215/pdf/imm3215.pdf * non-linear least squares at http://www.imm.dtu.dk/pubdb/views/edoc_download.php/3215/pdf/imm3215.pdf
...@@ -81,6 +81,7 @@ int LEVMAR_DER( ...@@ -81,6 +81,7 @@ int LEVMAR_DER(
* 7 - stopped by invalid (i.e. NaN or Inf) "func" values. This is a user error * 7 - stopped by invalid (i.e. NaN or Inf) "func" values. This is a user error
* info[7]= # function evaluations * info[7]= # function evaluations
* info[8]= # Jacobian evaluations * info[8]= # Jacobian evaluations
* info[9]= # linear systems solved, i.e. # attempts for reducing error
*/ */
LM_REAL *work, /* working memory at least LM_DER_WORKSZ() reals large, allocated if NULL */ LM_REAL *work, /* working memory at least LM_DER_WORKSZ() reals large, allocated if NULL */
LM_REAL *covar, /* O: Covariance matrix corresponding to LS solution; mxm. Set to NULL if not needed. */ LM_REAL *covar, /* O: Covariance matrix corresponding to LS solution; mxm. Set to NULL if not needed. */
...@@ -106,7 +107,7 @@ LM_REAL p_eL2, jacTe_inf, pDp_eL2; /* ||e(p)||_2, ||J^T e||_inf, ||e(p+Dp)||_2 * ...@@ -106,7 +107,7 @@ LM_REAL p_eL2, jacTe_inf, pDp_eL2; /* ||e(p)||_2, ||J^T e||_inf, ||e(p+Dp)||_2 *
LM_REAL p_L2, Dp_L2=LM_REAL_MAX, dF, dL; LM_REAL p_L2, Dp_L2=LM_REAL_MAX, dF, dL;
LM_REAL tau, eps1, eps2, eps2_sq, eps3; LM_REAL tau, eps1, eps2, eps2_sq, eps3;
LM_REAL init_p_eL2; LM_REAL init_p_eL2;
int nu=2, nu2, stop=0, nfev, njev=0; int nu=2, nu2, stop=0, nfev, njev=0, nlss=0;
const int nm=n*m; const int nm=n*m;
int (*linsolver)(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m)=NULL; int (*linsolver)(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m)=NULL;
...@@ -143,7 +144,7 @@ int (*linsolver)(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m)=NULL; ...@@ -143,7 +144,7 @@ int (*linsolver)(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m)=NULL;
work=(LM_REAL *)malloc(worksz*sizeof(LM_REAL)); /* allocate a big chunk in one step */ work=(LM_REAL *)malloc(worksz*sizeof(LM_REAL)); /* allocate a big chunk in one step */
if(!work){ if(!work){
fprintf(stderr, LCAT(LEVMAR_DER, "(): memory allocation request failed\n")); fprintf(stderr, LCAT(LEVMAR_DER, "(): memory allocation request failed\n"));
exit(1); return LM_ERROR;
} }
freework=1; freework=1;
} }
...@@ -181,7 +182,7 @@ int (*linsolver)(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m)=NULL; ...@@ -181,7 +182,7 @@ int (*linsolver)(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m)=NULL;
} }
/* Compute the Jacobian J at p, J^T J, J^T e, ||J^T e||_inf and ||p||^2. /* Compute the Jacobian J at p, J^T J, J^T e, ||J^T e||_inf and ||p||^2.
* Since J^T J is symmetric, its computation can be speeded up by computing * Since J^T J is symmetric, its computation can be sped up by computing
* only its upper triangular part and copying it to the lower part * only its upper triangular part and copying it to the lower part
*/ */
...@@ -189,38 +190,51 @@ int (*linsolver)(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m)=NULL; ...@@ -189,38 +190,51 @@ int (*linsolver)(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m)=NULL;
/* J^T J, J^T e */ /* J^T J, J^T e */
if(nm<__BLOCKSZ__SQ){ // this is a small problem if(nm<__BLOCKSZ__SQ){ // this is a small problem
/* This is the straightforward way to compute J^T J, J^T e. However, due to /* J^T*J_ij = \sum_l J^T_il * J_lj = \sum_l J_li * J_lj.
* its noncontinuous memory access pattern, it incures many cache misses when * Thus, the product J^T J can be computed using an outer loop for
* applied to large minimization problems (i.e. problems involving a large * l that adds J_li*J_lj to each element ij of the result. Note that
* number of free variables and measurements), in which J is too large to * with this scheme, the accesses to J and JtJ are always along rows,
* fit in the L1 cache. For such problems, a cache-efficient blocking scheme * therefore induces less cache misses compared to the straightforward
* is preferable. * algorithm for computing the product (i.e., l loop is innermost one).
* A similar scheme applies to the computation of J^T e.
* However, for large minimization problems (i.e., involving a large number
* of unknowns and measurements) for which J/J^T J rows are too large to
* fit in the L1 cache, even this scheme incures many cache misses. In
* such cases, a cache-efficient blocking scheme is preferable.
* *
* Thanks to John Nitao of Lawrence Livermore Lab for pointing out this * Thanks to John Nitao of Lawrence Livermore Lab for pointing out this
* performance problem. * performance problem.
* *
* On the other hand, the straightforward algorithm is faster on small * Note that the non-blocking algorithm is faster on small
* problems since in this case it avoids the overheads of blocking. * problems since in this case it avoids the overheads of blocking.
*/ */
for(i=0; i<m; ++i){ /* looping downwards saves a few computations */
for(j=i; j<m; ++j){ register int l, im;
int lm; register LM_REAL alpha, *jaclm;
for(l=0, tmp=0.0; l<n; ++l){ for(i=m*m; i-->0; )
lm=l*m; jacTjac[i]=0.0;
tmp+=jac[lm+i]*jac[lm+j]; for(i=m; i-->0; )
} jacTe[i]=0.0;
/* store tmp in the corresponding upper and lower part elements */ for(l=n; l-->0; ){
jacTjac[i*m+j]=jacTjac[j*m+i]=tmp; jaclm=jac+l*m;
} for(i=m; i-->0; ){
im=i*m;
alpha=jaclm[i]; //jac[l*m+i];
for(j=i+1; j-->0; ) /* j<=i computes lower triangular part only */
jacTjac[im+j]+=jaclm[j]*alpha; //jac[l*m+j]
/* J^T e */ /* J^T e */
for(l=0, tmp=0.0; l<n; ++l) jacTe[i]+=alpha*e[l];
tmp+=jac[l*m+i]*e[l]; }
jacTe[i]=tmp;
} }
for(i=m; i-->0; ) /* copy to upper part */
for(j=i+1; j<m; ++j)
jacTjac[i*m+j]=jacTjac[j*m+i];
} }
else{ // this is a large problem else{ // this is a large problem
/* Cache efficient computation of J^T J based on blocking /* Cache efficient computation of J^T J based on blocking
...@@ -284,15 +298,15 @@ if(!(k%100)){ ...@@ -284,15 +298,15 @@ if(!(k%100)){
* SVD is the slowest but most accurate; LU offers a tradeoff between accuracy and speed * SVD is the slowest but most accurate; LU offers a tradeoff between accuracy and speed
*/ */
issolved=AX_EQ_B_LU(jacTjac, jacTe, Dp, m); linsolver=AX_EQ_B_LU; issolved=AX_EQ_B_LU(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_LU;
//issolved=AX_EQ_B_CHOL(jacTjac, jacTe, Dp, m); linsolver=AX_EQ_B_CHOL; //issolved=AX_EQ_B_CHOL(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_CHOL;
//issolved=AX_EQ_B_QR(jacTjac, jacTe, Dp, m); linsolver=AX_EQ_B_QR; //issolved=AX_EQ_B_QR(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_QR;
//issolved=AX_EQ_B_QRLS(jacTjac, jacTe, Dp, m, m); linsolver=AX_EQ_B_QRLS; //issolved=AX_EQ_B_QRLS(jacTjac, jacTe, Dp, m, m); ++nlss; linsolver=(int (*)(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m))AX_EQ_B_QRLS;
//issolved=AX_EQ_B_SVD(jacTjac, jacTe, Dp, m); linsolver=AX_EQ_B_SVD; //issolved=AX_EQ_B_SVD(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_SVD;
#else #else
/* use the LU included with levmar */ /* use the LU included with levmar */
issolved=AX_EQ_B_LU(jacTjac, jacTe, Dp, m); linsolver=AX_EQ_B_LU; issolved=AX_EQ_B_LU(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_LU;
#endif /* HAVE_LAPACK */ #endif /* HAVE_LAPACK */
if(issolved){ if(issolved){
...@@ -389,6 +403,7 @@ if(!(k%100)){ ...@@ -389,6 +403,7 @@ if(!(k%100)){
info[6]=(LM_REAL)stop; info[6]=(LM_REAL)stop;
info[7]=(LM_REAL)nfev; info[7]=(LM_REAL)nfev;
info[8]=(LM_REAL)njev; info[8]=(LM_REAL)njev;
info[9]=(LM_REAL)nlss;
} }
/* covariance matrix */ /* covariance matrix */
...@@ -436,6 +451,7 @@ int LEVMAR_DIF( ...@@ -436,6 +451,7 @@ int LEVMAR_DIF(
* 7 - stopped by invalid (i.e. NaN or Inf) "func" values. This is a user error * 7 - stopped by invalid (i.e. NaN or Inf) "func" values. This is a user error
* info[7]= # function evaluations * info[7]= # function evaluations
* info[8]= # Jacobian evaluations * info[8]= # Jacobian evaluations
* info[9]= # linear systems solved, i.e. # attempts for reducing error
*/ */
LM_REAL *work, /* working memory at least LM_DIF_WORKSZ() reals large, allocated if NULL */ LM_REAL *work, /* working memory at least LM_DIF_WORKSZ() reals large, allocated if NULL */
LM_REAL *covar, /* O: Covariance matrix corresponding to LS solution; mxm. Set to NULL if not needed. */ LM_REAL *covar, /* O: Covariance matrix corresponding to LS solution; mxm. Set to NULL if not needed. */
...@@ -465,7 +481,7 @@ LM_REAL p_eL2, jacTe_inf, pDp_eL2; /* ||e(p)||_2, ||J^T e||_inf, ||e(p+Dp)||_2 * ...@@ -465,7 +481,7 @@ LM_REAL p_eL2, jacTe_inf, pDp_eL2; /* ||e(p)||_2, ||J^T e||_inf, ||e(p+Dp)||_2 *
LM_REAL p_L2, Dp_L2=LM_REAL_MAX, dF, dL; LM_REAL p_L2, Dp_L2=LM_REAL_MAX, dF, dL;
LM_REAL tau, eps1, eps2, eps2_sq, eps3, delta; LM_REAL tau, eps1, eps2, eps2_sq, eps3, delta;
LM_REAL init_p_eL2; LM_REAL init_p_eL2;
int nu, nu2, stop=0, nfev, njap=0, K=(m>=10)? m: 10, updjac, updp=1, newjac; int nu, nu2, stop=0, nfev, njap=0, nlss=0, K=(m>=10)? m: 10, updjac, updp=1, newjac;
const int nm=n*m; const int nm=n*m;
int (*linsolver)(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m)=NULL; int (*linsolver)(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m)=NULL;
...@@ -503,7 +519,7 @@ int (*linsolver)(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m)=NULL; ...@@ -503,7 +519,7 @@ int (*linsolver)(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m)=NULL;
work=(LM_REAL *)malloc(worksz*sizeof(LM_REAL)); /* allocate a big chunk in one step */ work=(LM_REAL *)malloc(worksz*sizeof(LM_REAL)); /* allocate a big chunk in one step */
if(!work){ if(!work){
fprintf(stderr, LCAT(LEVMAR_DIF, "(): memory allocation request failed\n")); fprintf(stderr, LCAT(LEVMAR_DIF, "(): memory allocation request failed\n"));
exit(1); return LM_ERROR;
} }
freework=1; freework=1;
} }
...@@ -565,37 +581,49 @@ int (*linsolver)(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m)=NULL; ...@@ -565,37 +581,49 @@ int (*linsolver)(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m)=NULL;
/* J^T J, J^T e */ /* J^T J, J^T e */
if(nm<=__BLOCKSZ__SQ){ // this is a small problem if(nm<=__BLOCKSZ__SQ){ // this is a small problem
/* This is the straightforward way to compute J^T J, J^T e. However, due to /* J^T*J_ij = \sum_l J^T_il * J_lj = \sum_l J_li * J_lj.
* its noncontinuous memory access pattern, it incures many cache misses when * Thus, the product J^T J can be computed using an outer loop for
* applied to large minimization problems (i.e. problems involving a large * l that adds J_li*J_lj to each element ij of the result. Note that
* number of free variables and measurements), in which J is too large to * with this scheme, the accesses to J and JtJ are always along rows,
* fit in the L1 cache. For such problems, a cache-efficient blocking scheme * therefore induces less cache misses compared to the straightforward
* is preferable. * algorithm for computing the product (i.e., l loop is innermost one).
* A similar scheme applies to the computation of J^T e.
* However, for large minimization problems (i.e., involving a large number
* of unknowns and measurements) for which J/J^T J rows are too large to
* fit in the L1 cache, even this scheme incures many cache misses. In
* such cases, a cache-efficient blocking scheme is preferable.
* *
* Thanks to John Nitao of Lawrence Livermore Lab for pointing out this * Thanks to John Nitao of Lawrence Livermore Lab for pointing out this
* performance problem. * performance problem.
* *
* On the other hand, the straightforward algorithm is faster on small * Note that the non-blocking algorithm is faster on small
* problems since in this case it avoids the overheads of blocking. * problems since in this case it avoids the overheads of blocking.
*/ */
register int l, im;
for(i=0; i<m; ++i){ register LM_REAL alpha, *jaclm;
for(j=i; j<m; ++j){
int lm;
for(l=0, tmp=0.0; l<n; ++l){ /* looping downwards saves a few computations */
lm=l*m; for(i=m*m; i-->0; )
tmp+=jac[lm+i]*jac[lm+j]; jacTjac[i]=0.0;
} for(i=m; i-->0; )
jacTe[i]=0.0;
jacTjac[i*m+j]=jacTjac[j*m+i]=tmp; for(l=n; l-->0; ){
} jaclm=jac+l*m;
for(i=m; i-->0; ){
im=i*m;
alpha=jaclm[i]; //jac[l*m+i];
for(j=i+1; j-->0; ) /* j<=i computes lower triangular part only */
jacTjac[im+j]+=jaclm[j]*alpha; //jac[l*m+j]
/* J^T e */ /* J^T e */
for(l=0, tmp=0.0; l<n; ++l) jacTe[i]+=alpha*e[l];
tmp+=jac[l*m+i]*e[l]; }
jacTe[i]=tmp;
} }
for(i=m; i-->0; ) /* copy to upper part */
for(j=i+1; j<m; ++j)
jacTjac[i*m+j]=jacTjac[j*m+i];
} }
else{ // this is a large problem else{ // this is a large problem
/* Cache efficient computation of J^T J based on blocking /* Cache efficient computation of J^T J based on blocking
...@@ -660,14 +688,14 @@ if(!(k%100)){ ...@@ -660,14 +688,14 @@ if(!(k%100)){
* SVD is the slowest but most accurate; LU offers a tradeoff between accuracy and speed * SVD is the slowest but most accurate; LU offers a tradeoff between accuracy and speed
*/ */
issolved=AX_EQ_B_LU(jacTjac, jacTe, Dp, m); linsolver=AX_EQ_B_LU; issolved=AX_EQ_B_LU(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_LU;
//issolved=AX_EQ_B_CHOL(jacTjac, jacTe, Dp, m); linsolver=AX_EQ_B_CHOL; //issolved=AX_EQ_B_CHOL(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_CHOL;
//issolved=AX_EQ_B_QR(jacTjac, jacTe, Dp, m); linsolver=AX_EQ_B_QR; //issolved=AX_EQ_B_QR(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_QR;
//issolved=AX_EQ_B_QRLS(jacTjac, jacTe, Dp, m, m); linsolver=AX_EQ_B_QRLS; //issolved=AX_EQ_B_QRLS(jacTjac, jacTe, Dp, m, m); ++nlss; linsolver=(int (*)(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m))AX_EQ_B_QRLS;
//issolved=AX_EQ_B_SVD(jacTjac, jacTe, Dp, m); linsolver=AX_EQ_B_SVD; //issolved=AX_EQ_B_SVD(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_SVD;
#else #else
/* use the LU included with levmar */ /* use the LU included with levmar */
issolved=AX_EQ_B_LU(jacTjac, jacTe, Dp, m); linsolver=AX_EQ_B_LU; issolved=AX_EQ_B_LU(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_LU;
#endif /* HAVE_LAPACK */ #endif /* HAVE_LAPACK */
if(issolved){ if(issolved){
...@@ -778,6 +806,7 @@ if(!(k%100)){ ...@@ -778,6 +806,7 @@ if(!(k%100)){
info[6]=(LM_REAL)stop; info[6]=(LM_REAL)stop;
info[7]=(LM_REAL)nfev; info[7]=(LM_REAL)nfev;
info[8]=(LM_REAL)njap; info[8]=(LM_REAL)njap;
info[9]=(LM_REAL)nlss;
} }
/* covariance matrix */ /* covariance matrix */
......
...@@ -28,7 +28,7 @@ ...@@ -28,7 +28,7 @@
#define BOXPROJECT LM_ADD_PREFIX(boxProject) #define BOXPROJECT LM_ADD_PREFIX(boxProject)
#define LEVMAR_BOX_CHECK LM_ADD_PREFIX(levmar_box_check) #define LEVMAR_BOX_CHECK LM_ADD_PREFIX(levmar_box_check)
#define LEVMAR_BC_DER LM_ADD_PREFIX(levmar_bc_der) #define LEVMAR_BC_DER LM_ADD_PREFIX(levmar_bc_der)
#define LEVMAR_BC_DIF LM_ADD_PREFIX(levmar_bc_dif) //CHECKME #define LEVMAR_BC_DIF LM_ADD_PREFIX(levmar_bc_dif)
#define LEVMAR_FDIF_FORW_JAC_APPROX LM_ADD_PREFIX(levmar_fdif_forw_jac_approx) #define LEVMAR_FDIF_FORW_JAC_APPROX LM_ADD_PREFIX(levmar_fdif_forw_jac_approx)
#define LEVMAR_FDIF_CENT_JAC_APPROX LM_ADD_PREFIX(levmar_fdif_cent_jac_approx) #define LEVMAR_FDIF_CENT_JAC_APPROX LM_ADD_PREFIX(levmar_fdif_cent_jac_approx)
#define LEVMAR_TRANS_MAT_MAT_MULT LM_ADD_PREFIX(levmar_trans_mat_mat_mult) #define LEVMAR_TRANS_MAT_MAT_MULT LM_ADD_PREFIX(levmar_trans_mat_mat_mult)
...@@ -264,7 +264,7 @@ register int i; ...@@ -264,7 +264,7 @@ register int i;
* This function requires an analytic Jacobian. In case the latter is unavailable, * This function requires an analytic Jacobian. In case the latter is unavailable,
* use LEVMAR_BC_DIF() bellow * use LEVMAR_BC_DIF() bellow
* *
* Returns the number of iterations (>=0) if successfull, LM_ERROR if failed * Returns the number of iterations (>=0) if successful, LM_ERROR if failed
* *
* For details, see C. Kanzow, N. Yamashita and M. Fukushima: "Levenberg-Marquardt * For details, see C. Kanzow, N. Yamashita and M. Fukushima: "Levenberg-Marquardt
* methods for constrained nonlinear equations with strong local convergence properties", * methods for constrained nonlinear equations with strong local convergence properties",
...@@ -301,6 +301,7 @@ int LEVMAR_BC_DER( ...@@ -301,6 +301,7 @@ int LEVMAR_BC_DER(
* 7 - stopped by invalid (i.e. NaN or Inf) "func" values. This is a user error * 7 - stopped by invalid (i.e. NaN or Inf) "func" values. This is a user error
* info[7]= # function evaluations * info[7]= # function evaluations
* info[8]= # Jacobian evaluations * info[8]= # Jacobian evaluations
* info[9]= # linear systems solved, i.e. # attempts for reducing error
*/ */
LM_REAL *work, /* working memory at least LM_BC_DER_WORKSZ() reals large, allocated if NULL */ LM_REAL *work, /* working memory at least LM_BC_DER_WORKSZ() reals large, allocated if NULL */
LM_REAL *covar, /* O: Covariance matrix corresponding to LS solution; mxm. Set to NULL if not needed. */ LM_REAL *covar, /* O: Covariance matrix corresponding to LS solution; mxm. Set to NULL if not needed. */
...@@ -326,7 +327,7 @@ LM_REAL p_eL2, jacTe_inf, pDp_eL2; /* ||e(p)||_2, ||J^T e||_inf, ||e(p+Dp)||_2 * ...@@ -326,7 +327,7 @@ LM_REAL p_eL2, jacTe_inf, pDp_eL2; /* ||e(p)||_2, ||J^T e||_inf, ||e(p+Dp)||_2 *
LM_REAL p_L2, Dp_L2=LM_REAL_MAX, dF, dL; LM_REAL p_L2, Dp_L2=LM_REAL_MAX, dF, dL;
LM_REAL tau, eps1, eps2, eps2_sq, eps3; LM_REAL tau, eps1, eps2, eps2_sq, eps3;
LM_REAL init_p_eL2; LM_REAL init_p_eL2;
int nu=2, nu2, stop=0, nfev, njev=0; int nu=2, nu2, stop=0, nfev, njev=0, nlss=0;
const int nm=n*m; const int nm=n*m;
/* variables for constrained LM */ /* variables for constrained LM */
...@@ -378,7 +379,7 @@ int (*linsolver)(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m)=NULL; ...@@ -378,7 +379,7 @@ int (*linsolver)(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m)=NULL;
work=(LM_REAL *)malloc(worksz*sizeof(LM_REAL)); /* allocate a big chunk in one step */ work=(LM_REAL *)malloc(worksz*sizeof(LM_REAL)); /* allocate a big chunk in one step */
if(!work){ if(!work){
fprintf(stderr, LCAT(LEVMAR_BC_DER, "(): memory allocation request failed\n")); fprintf(stderr, LCAT(LEVMAR_BC_DER, "(): memory allocation request failed\n"));
exit(1); return LM_ERROR;
} }
freework=1; freework=1;
} }
...@@ -431,7 +432,7 @@ int (*linsolver)(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m)=NULL; ...@@ -431,7 +432,7 @@ int (*linsolver)(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m)=NULL;
} }
/* Compute the Jacobian J at p, J^T J, J^T e, ||J^T e||_inf and ||p||^2. /* Compute the Jacobian J at p, J^T J, J^T e, ||J^T e||_inf and ||p||^2.
* Since J^T J is symmetric, its computation can be speeded up by computing * Since J^T J is symmetric, its computation can be sped up by computing
* only its upper triangular part and copying it to the lower part * only its upper triangular part and copying it to the lower part
*/ */
...@@ -439,38 +440,49 @@ int (*linsolver)(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m)=NULL; ...@@ -439,38 +440,49 @@ int (*linsolver)(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m)=NULL;
/* J^T J, J^T e */ /* J^T J, J^T e */
if(nm<__BLOCKSZ__SQ){ // this is a small problem if(nm<__BLOCKSZ__SQ){ // this is a small problem
/* This is the straightforward way to compute J^T J, J^T e. However, due to /* J^T*J_ij = \sum_l J^T_il * J_lj = \sum_l J_li * J_lj.
* its noncontinuous memory access pattern, it incures many cache misses when * Thus, the product J^T J can be computed using an outer loop for
* applied to large minimization problems (i.e. problems involving a large * l that adds J_li*J_lj to each element ij of the result. Note that
* number of free variables and measurements), in which J is too large to * with this scheme, the accesses to J and JtJ are always along rows,
* fit in the L1 cache. For such problems, a cache-efficient blocking scheme * therefore induces less cache misses compared to the straightforward
* is preferable. * algorithm for computing the product (i.e., l loop is innermost one).
* A similar scheme applies to the computation of J^T e.
* However, for large minimization problems (i.e., involving a large number
* of unknowns and measurements) for which J/J^T J rows are too large to
* fit in the L1 cache, even this scheme incures many cache misses. In
* such cases, a cache-efficient blocking scheme is preferable.
* *
* Thanks to John Nitao of Lawrence Livermore Lab for pointing out this * Thanks to John Nitao of Lawrence Livermore Lab for pointing out this
* performance problem. * performance problem.
* *
* On the other hand, the straightforward algorithm is faster on small * Note that the non-blocking algorithm is faster on small
* problems since in this case it avoids the overheads of blocking. * problems since in this case it avoids the overheads of blocking.
*/ */
register int l, im;
register LM_REAL alpha, *jaclm;
for(i=0; i<m; ++i){ /* looping downwards saves a few computations */
for(j=i; j<m; ++j){ for(i=m*m; i-->0; )
int lm; jacTjac[i]=0.0;
for(i=m; i-->0; )
jacTe[i]=0.0;
for(l=0, tmp=0.0; l<n; ++l){ for(l=n; l-->0; ){
lm=l*m; jaclm=jac+l*m;
tmp+=jac[lm+i]*jac[lm+j]; for(i=m; i-->0; ){
} im=i*m;
alpha=jaclm[i]; //jac[l*m+i];
for(j=i+1; j-->0; ) /* j<=i computes lower triangular part only */
jacTjac[im+j]+=jaclm[j]*alpha; //jac[l*m+j]
/* store tmp in the corresponding upper and lower part elements */ /* J^T e */
jacTjac[i*m+j]=jacTjac[j*m+i]=tmp; jacTe[i]+=alpha*e[l];
} }
/* J^T e */
for(l=0, tmp=0.0; l<n; ++l)
tmp+=jac[l*m+i]*e[l];
jacTe[i]=tmp;
} }
for(i=m; i-->0; ) /* copy to upper part */
for(j=i+1; j<m; ++j)
jacTjac[i*m+j]=jacTjac[j*m+i];
} }
else{ // this is a large problem else{ // this is a large problem
/* Cache efficient computation of J^T J based on blocking /* Cache efficient computation of J^T J based on blocking
...@@ -544,15 +556,15 @@ if(!(k%100)){ ...@@ -544,15 +556,15 @@ if(!(k%100)){
* SVD is the slowest but most accurate; LU offers a tradeoff between accuracy and speed * SVD is the slowest but most accurate; LU offers a tradeoff between accuracy and speed
*/ */
issolved=AX_EQ_B_LU(jacTjac, jacTe, Dp, m); linsolver=AX_EQ_B_LU; issolved=AX_EQ_B_LU(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_LU;
//issolved=AX_EQ_B_CHOL(jacTjac, jacTe, Dp, m); linsolver=AX_EQ_B_CHOL; //issolved=AX_EQ_B_CHOL(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_CHOL;
//issolved=AX_EQ_B_QR(jacTjac, jacTe, Dp, m); linsolver=AX_EQ_B_QR; //issolved=AX_EQ_B_QR(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_QR;
//issolved=AX_EQ_B_QRLS(jacTjac, jacTe, Dp, m, m); linsolver=AX_EQ_B_QRLS; //issolved=AX_EQ_B_QRLS(jacTjac, jacTe, Dp, m, m); ++nlss; linsolver=(int (*)(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m))AX_EQ_B_QRLS;
//issolved=AX_EQ_B_SVD(jacTjac, jacTe, Dp, m); linsolver=AX_EQ_B_SVD; //issolved=AX_EQ_B_SVD(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_SVD;
#else #else
/* use the LU included with levmar */ /* use the LU included with levmar */
issolved=AX_EQ_B_LU(jacTjac, jacTe, Dp, m); linsolver=AX_EQ_B_LU; issolved=AX_EQ_B_LU(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_LU;
#endif /* HAVE_LAPACK */ #endif /* HAVE_LAPACK */
if(issolved){ if(issolved){
...@@ -784,6 +796,7 @@ breaknested: /* NOTE: this point is also reached via an explicit goto! */ ...@@ -784,6 +796,7 @@ breaknested: /* NOTE: this point is also reached via an explicit goto! */
info[6]=(LM_REAL)stop; info[6]=(LM_REAL)stop;
info[7]=(LM_REAL)nfev; info[7]=(LM_REAL)nfev;
info[8]=(LM_REAL)njev; info[8]=(LM_REAL)njev;
info[9]=(LM_REAL)nlss;
} }
/* covariance matrix */ /* covariance matrix */
...@@ -808,13 +821,14 @@ printf("%d LM steps, %d line search, %d projected gradient\n", nLMsteps, nLSstep ...@@ -808,13 +821,14 @@ printf("%d LM steps, %d line search, %d projected gradient\n", nLMsteps, nLSstep
* version of LEVMAR_BC_DIF() is implemented... * version of LEVMAR_BC_DIF() is implemented...
*/ */
struct LMBC_DIF_DATA{ struct LMBC_DIF_DATA{
int ffdif; // nonzero if forward differencing is used
void (*func)(LM_REAL *p, LM_REAL *hx, int m, int n, void *adata); void (*func)(LM_REAL *p, LM_REAL *hx, int m, int n, void *adata);
LM_REAL *hx, *hxx; LM_REAL *hx, *hxx;
void *adata; void *adata;
LM_REAL delta; LM_REAL delta;
}; };
void LMBC_DIF_FUNC(LM_REAL *p, LM_REAL *hx, int m, int n, void *data) static void LMBC_DIF_FUNC(LM_REAL *p, LM_REAL *hx, int m, int n, void *data)
{ {
struct LMBC_DIF_DATA *dta=(struct LMBC_DIF_DATA *)data; struct LMBC_DIF_DATA *dta=(struct LMBC_DIF_DATA *)data;
...@@ -822,13 +836,17 @@ struct LMBC_DIF_DATA *dta=(struct LMBC_DIF_DATA *)data; ...@@ -822,13 +836,17 @@ struct LMBC_DIF_DATA *dta=(struct LMBC_DIF_DATA *)data;
(*(dta->func))(p, hx, m, n, dta->adata); (*(dta->func))(p, hx, m, n, dta->adata);
} }
void LMBC_DIF_JACF(LM_REAL *p, LM_REAL *jac, int m, int n, void *data) static void LMBC_DIF_JACF(LM_REAL *p, LM_REAL *jac, int m, int n, void *data)
{ {
struct LMBC_DIF_DATA *dta=(struct LMBC_DIF_DATA *)data; struct LMBC_DIF_DATA *dta=(struct LMBC_DIF_DATA *)data;
/* evaluate user-supplied function at p */ if(dta->ffdif){
(*(dta->func))(p, dta->hx, m, n, dta->adata); /* evaluate user-supplied function at p */
LEVMAR_FDIF_FORW_JAC_APPROX(dta->func, p, dta->hx, dta->hxx, dta->delta, jac, m, n, dta->adata); (*(dta->func))(p, dta->hx, m, n, dta->adata);
LEVMAR_FDIF_FORW_JAC_APPROX(dta->func, p, dta->hx, dta->hxx, dta->delta, jac, m, n, dta->adata);
}
else
LEVMAR_FDIF_CENT_JAC_APPROX(dta->func, p, dta->hx, dta->hxx, dta->delta, jac, m, n, dta->adata);
} }
...@@ -866,6 +884,7 @@ int LEVMAR_BC_DIF( ...@@ -866,6 +884,7 @@ int LEVMAR_BC_DIF(
* 7 - stopped by invalid (i.e. NaN or Inf) "func" values. This is a user error * 7 - stopped by invalid (i.e. NaN or Inf) "func" values. This is a user error
* info[7]= # function evaluations * info[7]= # function evaluations
* info[8]= # Jacobian evaluations * info[8]= # Jacobian evaluations
* info[9]= # linear systems solved, i.e. # attempts for reducing error
*/ */
LM_REAL *work, /* working memory at least LM_BC_DIF_WORKSZ() reals large, allocated if NULL */ LM_REAL *work, /* working memory at least LM_BC_DIF_WORKSZ() reals large, allocated if NULL */
LM_REAL *covar, /* O: Covariance matrix corresponding to LS solution; mxm. Set to NULL if not needed. */ LM_REAL *covar, /* O: Covariance matrix corresponding to LS solution; mxm. Set to NULL if not needed. */
...@@ -876,27 +895,34 @@ int LEVMAR_BC_DIF( ...@@ -876,27 +895,34 @@ int LEVMAR_BC_DIF(
struct LMBC_DIF_DATA data; struct LMBC_DIF_DATA data;
int ret; int ret;
//fprintf(stderr, RCAT("\nWarning: current implementation of ", LEVMAR_BC_DIF) "() does not use a secant approach!\n\n"); //fprintf(stderr, RCAT("\nWarning: current implementation of ", LEVMAR_BC_DIF) "() does not use a secant approach!\n\n");
data.func=func; data.ffdif=!opts || opts[4]>=0.0;
data.hx=(LM_REAL *)malloc(2*n*sizeof(LM_REAL)); /* allocate a big chunk in one step */
if(!data.hx){
fprintf(stderr, LCAT(LEVMAR_BC_DIF, "(): memory allocation request failed\n"));
exit(1);
}
data.hxx=data.hx+n;
data.adata=adata;
data.delta=(opts)? FABS(opts[4]) : (LM_REAL)LM_DIFF_DELTA; // no central differences here...
ret=LEVMAR_BC_DER(LMBC_DIF_FUNC, LMBC_DIF_JACF, p, x, m, n, lb, ub, itmax, opts, info, work, covar, (void *)&data); data.func=func;
data.hx=(LM_REAL *)malloc(2*n*sizeof(LM_REAL)); /* allocate a big chunk in one step */
if(!data.hx){
fprintf(stderr, LCAT(LEVMAR_BC_DIF, "(): memory allocation request failed\n"));
return LM_ERROR;
}
data.hxx=data.hx+n;
data.adata=adata;
data.delta=(opts)? FABS(opts[4]) : (LM_REAL)LM_DIFF_DELTA;
ret=LEVMAR_BC_DER(LMBC_DIF_FUNC, LMBC_DIF_JACF, p, x, m, n, lb, ub, itmax, opts, info, work, covar, (void *)&data);
if(info) /* correct the number of function calls */ if(info){ /* correct the number of function calls */
if(data.ffdif)
info[7]+=info[8]*(m+1); /* each Jacobian evaluation costs m+1 function calls */ info[7]+=info[8]*(m+1); /* each Jacobian evaluation costs m+1 function calls */
else
info[7]+=info[8]*(2*m); /* each Jacobian evaluation costs 2*m function calls */
}
free(data.hx); free(data.hx);
return ret; return ret;
} }
/* undefine everything. THIS MUST REMAIN AT THE END OF THE FILE */ /* undefine everything. THIS MUST REMAIN AT THE END OF THE FILE */
#undef FUNC_STATE #undef FUNC_STATE
#undef LNSRCH #undef LNSRCH
......
...@@ -70,7 +70,6 @@ ...@@ -70,7 +70,6 @@
#define LEVMAR_BLEC_DER LM_ADD_PREFIX(levmar_blec_der) #define LEVMAR_BLEC_DER LM_ADD_PREFIX(levmar_blec_der)
#define LEVMAR_BLEC_DIF LM_ADD_PREFIX(levmar_blec_dif) #define LEVMAR_BLEC_DIF LM_ADD_PREFIX(levmar_blec_dif)
#define LEVMAR_COVAR LM_ADD_PREFIX(levmar_covar) #define LEVMAR_COVAR LM_ADD_PREFIX(levmar_covar)
#define LEVMAR_FDIF_FORW_JAC_APPROX LM_ADD_PREFIX(levmar_fdif_forw_jac_approx)
struct LMBLEC_DATA{ struct LMBLEC_DATA{
LM_REAL *x, *lb, *ub, *w; LM_REAL *x, *lb, *ub, *w;
...@@ -177,7 +176,7 @@ register LM_REAL *lb, *ub, *w, tmp; ...@@ -177,7 +176,7 @@ register LM_REAL *lb, *ub, *w, tmp;
* This function requires an analytic Jacobian. In case the latter is unavailable, * This function requires an analytic Jacobian. In case the latter is unavailable,
* use LEVMAR_BLEC_DIF() bellow * use LEVMAR_BLEC_DIF() bellow
* *
* Returns the number of iterations (>=0) if successfull, LM_ERROR if failed * Returns the number of iterations (>=0) if successful, LM_ERROR if failed
* *
* For more details on the algorithm implemented by this function, please refer to * For more details on the algorithm implemented by this function, please refer to
* the comments in the top of this file. * the comments in the top of this file.
...@@ -214,6 +213,7 @@ int LEVMAR_BLEC_DER( ...@@ -214,6 +213,7 @@ int LEVMAR_BLEC_DER(
* 7 - stopped by invalid (i.e. NaN or Inf) "func" values. This is a user error * 7 - stopped by invalid (i.e. NaN or Inf) "func" values. This is a user error
* info[7]= # function evaluations * info[7]= # function evaluations
* info[8]= # Jacobian evaluations * info[8]= # Jacobian evaluations
* info[9]= # linear systems solved, i.e. # attempts for reducing error
*/ */
LM_REAL *work, /* working memory at least LM_BLEC_DER_WORKSZ() reals large, allocated if NULL */ LM_REAL *work, /* working memory at least LM_BLEC_DER_WORKSZ() reals large, allocated if NULL */
LM_REAL *covar, /* O: Covariance matrix corresponding to LS solution; mxm. Set to NULL if not needed. */ LM_REAL *covar, /* O: Covariance matrix corresponding to LS solution; mxm. Set to NULL if not needed. */
...@@ -232,6 +232,12 @@ int LEVMAR_BLEC_DER( ...@@ -232,6 +232,12 @@ int LEVMAR_BLEC_DER(
return LM_ERROR; return LM_ERROR;
} }
if(!lb && !ub){
fprintf(stderr, RCAT(LCAT(LEVMAR_BLEC_DER, "(): lower and upper bounds for box constraints cannot be both NULL, use "),
LEVMAR_LEC_DER) "() in this case!\n");
return LM_ERROR;
}
if(!LEVMAR_BOX_CHECK(lb, ub, m)){ if(!LEVMAR_BOX_CHECK(lb, ub, m)){
fprintf(stderr, LCAT(LEVMAR_BLEC_DER, "(): at least one lower bound exceeds the upper one\n")); fprintf(stderr, LCAT(LEVMAR_BLEC_DER, "(): at least one lower bound exceeds the upper one\n"));
return LM_ERROR; return LM_ERROR;
...@@ -242,7 +248,7 @@ int LEVMAR_BLEC_DER( ...@@ -242,7 +248,7 @@ int LEVMAR_BLEC_DER(
data.x=(LM_REAL *)malloc((n+m)*sizeof(LM_REAL)); data.x=(LM_REAL *)malloc((n+m)*sizeof(LM_REAL));
if(!data.x){ if(!data.x){
fprintf(stderr, LCAT(LEVMAR_BLEC_DER, "(): memory allocation request #1 failed\n")); fprintf(stderr, LCAT(LEVMAR_BLEC_DER, "(): memory allocation request #1 failed\n"));
exit(1); return LM_ERROR;
} }
for(i=0; i<n; ++i) for(i=0; i<n; ++i)
...@@ -253,16 +259,20 @@ int LEVMAR_BLEC_DER( ...@@ -253,16 +259,20 @@ int LEVMAR_BLEC_DER(
else else
data.x=NULL; data.x=NULL;
data.w=(LM_REAL *)malloc(m*sizeof(LM_REAL) + m*sizeof(int)); data.w=(LM_REAL *)malloc(m*sizeof(LM_REAL) + m*sizeof(int)); /* should be arranged in that order for proper doubles alignment */
if(!data.w){ if(!data.w){
fprintf(stderr, LCAT(LEVMAR_BLEC_DER, "(): memory allocation request #2 failed\n")); fprintf(stderr, LCAT(LEVMAR_BLEC_DER, "(): memory allocation request #2 failed\n"));
exit(1); if(data.x) free(data.x);
return LM_ERROR;
} }
data.bctype=(int *)(data.w+m); data.bctype=(int *)(data.w+m);
/* note: at this point, one of lb, ub are not NULL */
for(i=0; i<m; ++i){ for(i=0; i<m; ++i){
data.w[i]=(!wghts)? __BC_WEIGHT__ : wghts[i]; data.w[i]=(!wghts)? __BC_WEIGHT__ : wghts[i];
if(ub[i]!=LM_REAL_MAX && lb[i]!=LM_REAL_MIN) data.bctype[i]=__BC_INTERVAL__; if(!lb) data.bctype[i]=__BC_HIGH__;
else if(!ub) data.bctype[i]=__BC_LOW__;
else if(ub[i]!=LM_REAL_MAX && lb[i]!=LM_REAL_MIN) data.bctype[i]=__BC_INTERVAL__;
else if(lb[i]!=LM_REAL_MIN) data.bctype[i]=__BC_LOW__; else if(lb[i]!=LM_REAL_MIN) data.bctype[i]=__BC_LOW__;
else data.bctype[i]=__BC_HIGH__; else data.bctype[i]=__BC_HIGH__;
} }
...@@ -318,6 +328,7 @@ int LEVMAR_BLEC_DIF( ...@@ -318,6 +328,7 @@ int LEVMAR_BLEC_DIF(
* 7 - stopped by invalid (i.e. NaN or Inf) "func" values. This is a user error * 7 - stopped by invalid (i.e. NaN or Inf) "func" values. This is a user error
* info[7]= # function evaluations * info[7]= # function evaluations
* info[8]= # Jacobian evaluations * info[8]= # Jacobian evaluations
* info[9]= # linear systems solved, i.e. # attempts for reducing error
*/ */
LM_REAL *work, /* working memory at least LM_BLEC_DIF_WORKSZ() reals large, allocated if NULL */ LM_REAL *work, /* working memory at least LM_BLEC_DIF_WORKSZ() reals large, allocated if NULL */
LM_REAL *covar, /* O: Covariance matrix corresponding to LS solution; mxm. Set to NULL if not needed. */ LM_REAL *covar, /* O: Covariance matrix corresponding to LS solution; mxm. Set to NULL if not needed. */
...@@ -330,6 +341,12 @@ int LEVMAR_BLEC_DIF( ...@@ -330,6 +341,12 @@ int LEVMAR_BLEC_DIF(
register int i; register int i;
LM_REAL locinfo[LM_INFO_SZ]; LM_REAL locinfo[LM_INFO_SZ];
if(!lb && !ub){
fprintf(stderr, RCAT(LCAT(LEVMAR_BLEC_DIF, "(): lower and upper bounds for box constraints cannot be both NULL, use "),
LEVMAR_LEC_DIF) "() in this case!\n");
return LM_ERROR;
}
if(!LEVMAR_BOX_CHECK(lb, ub, m)){ if(!LEVMAR_BOX_CHECK(lb, ub, m)){
fprintf(stderr, LCAT(LEVMAR_BLEC_DER, "(): at least one lower bound exceeds the upper one\n")); fprintf(stderr, LCAT(LEVMAR_BLEC_DER, "(): at least one lower bound exceeds the upper one\n"));
return LM_ERROR; return LM_ERROR;
...@@ -340,7 +357,7 @@ int LEVMAR_BLEC_DIF( ...@@ -340,7 +357,7 @@ int LEVMAR_BLEC_DIF(
data.x=(LM_REAL *)malloc((n+m)*sizeof(LM_REAL)); data.x=(LM_REAL *)malloc((n+m)*sizeof(LM_REAL));
if(!data.x){ if(!data.x){
fprintf(stderr, LCAT(LEVMAR_BLEC_DER, "(): memory allocation request #1 failed\n")); fprintf(stderr, LCAT(LEVMAR_BLEC_DER, "(): memory allocation request #1 failed\n"));
exit(1); return LM_ERROR;
} }
for(i=0; i<n; ++i) for(i=0; i<n; ++i)
...@@ -351,16 +368,20 @@ int LEVMAR_BLEC_DIF( ...@@ -351,16 +368,20 @@ int LEVMAR_BLEC_DIF(
else else
data.x=NULL; data.x=NULL;
data.w=(LM_REAL *)malloc(m*sizeof(LM_REAL) + m*sizeof(int)); data.w=(LM_REAL *)malloc(m*sizeof(LM_REAL) + m*sizeof(int)); /* should be arranged in that order for proper doubles alignment */
if(!data.w){ if(!data.w){
fprintf(stderr, LCAT(LEVMAR_BLEC_DER, "(): memory allocation request #2 failed\n")); fprintf(stderr, LCAT(LEVMAR_BLEC_DER, "(): memory allocation request #2 failed\n"));
exit(1); if(data.x) free(data.x);
return LM_ERROR;
} }
data.bctype=(int *)(data.w+m); data.bctype=(int *)(data.w+m);
/* note: at this point, one of lb, ub are not NULL */
for(i=0; i<m; ++i){ for(i=0; i<m; ++i){
data.w[i]=(!wghts)? __BC_WEIGHT__ : wghts[i]; data.w[i]=(!wghts)? __BC_WEIGHT__ : wghts[i];
if(ub[i]!=LM_REAL_MAX && lb[i]!=LM_REAL_MIN) data.bctype[i]=__BC_INTERVAL__; if(!lb) data.bctype[i]=__BC_HIGH__;
else if(!ub) data.bctype[i]=__BC_LOW__;
else if(ub[i]!=LM_REAL_MAX && lb[i]!=LM_REAL_MIN) data.bctype[i]=__BC_INTERVAL__;
else if(lb[i]!=LM_REAL_MIN) data.bctype[i]=__BC_LOW__; else if(lb[i]!=LM_REAL_MIN) data.bctype[i]=__BC_LOW__;
else data.bctype[i]=__BC_HIGH__; else data.bctype[i]=__BC_HIGH__;
} }
...@@ -385,7 +406,6 @@ int LEVMAR_BLEC_DIF( ...@@ -385,7 +406,6 @@ int LEVMAR_BLEC_DIF(
#undef LMBLEC_DATA #undef LMBLEC_DATA
#undef LMBLEC_FUNC #undef LMBLEC_FUNC
#undef LMBLEC_JACF #undef LMBLEC_JACF
#undef LEVMAR_FDIF_FORW_JAC_APPROX
#undef LEVMAR_COVAR #undef LEVMAR_COVAR
#undef LEVMAR_LEC_DER #undef LEVMAR_LEC_DER
#undef LEVMAR_LEC_DIF #undef LEVMAR_LEC_DIF
......
...@@ -685,7 +685,8 @@ char *probname[]={ ...@@ -685,7 +685,8 @@ char *probname[]={
}; };
opts[0]=LM_INIT_MU; opts[1]=1E-15; opts[2]=1E-15; opts[3]=1E-20; opts[0]=LM_INIT_MU; opts[1]=1E-15; opts[2]=1E-15; opts[3]=1E-20;
opts[4]=LM_DIFF_DELTA; // relevant only if the finite difference Jacobian version is used opts[4]=LM_DIFF_DELTA; // relevant only if the Jacobian is approximated using finite differences; specifies forward differencing
//opts[4]=-LM_DIFF_DELTA; // specifies central differencing to approximate Jacobian; more accurate but more expensive to compute!
/* uncomment the appropriate line below to select a minimization problem */ /* uncomment the appropriate line below to select a minimization problem */
problem= problem=
......
...@@ -35,9 +35,9 @@ ...@@ -35,9 +35,9 @@
#define LEVMAR_COVAR LM_ADD_PREFIX(levmar_covar) #define LEVMAR_COVAR LM_ADD_PREFIX(levmar_covar)
#define LEVMAR_FDIF_FORW_JAC_APPROX LM_ADD_PREFIX(levmar_fdif_forw_jac_approx) #define LEVMAR_FDIF_FORW_JAC_APPROX LM_ADD_PREFIX(levmar_fdif_forw_jac_approx)
#define GEQP3 LM_ADD_PREFIX(geqp3_) #define GEQP3 LM_MK_LAPACK_NAME(geqp3)
#define ORGQR LM_ADD_PREFIX(orgqr_) #define ORGQR LM_MK_LAPACK_NAME(orgqr)
#define TRTRI LM_ADD_PREFIX(trtri_) #define TRTRI LM_MK_LAPACK_NAME(trtri)
struct LMLEC_DATA{ struct LMLEC_DATA{
LM_REAL *c, *Z, *p, *jac; LM_REAL *c, *Z, *p, *jac;
...@@ -76,7 +76,7 @@ extern int TRTRI(char *uplo, char *diag, int *n, LM_REAL *a, int *lda, int *info ...@@ -76,7 +76,7 @@ extern int TRTRI(char *uplo, char *diag, int *n, LM_REAL *a, int *lda, int *info
* *
* The function accepts A, b and computes c, Y, Z. If b or c is NULL, c is not * The function accepts A, b and computes c, Y, Z. If b or c is NULL, c is not
* computed. Also, Y can be NULL in which case it is not referenced. * computed. Also, Y can be NULL in which case it is not referenced.
* The function returns 0 in case of error, A's computed rank if successfull * The function returns LM_ERROR in case of error, A's computed rank if successful
* *
*/ */
static int LMLEC_ELIM(LM_REAL *A, LM_REAL *b, LM_REAL *c, LM_REAL *Y, LM_REAL *Z, int m, int n) static int LMLEC_ELIM(LM_REAL *A, LM_REAL *b, LM_REAL *c, LM_REAL *Y, LM_REAL *Z, int m, int n)
...@@ -109,19 +109,23 @@ register int i, j, k; ...@@ -109,19 +109,23 @@ register int i, j, k;
r_sz=mintmn*mintmn; // actually smaller if a is not of full row rank r_sz=mintmn*mintmn; // actually smaller if a is not of full row rank
Y_sz=(Y)? 0 : tm*tn; Y_sz=(Y)? 0 : tm*tn;
tot_sz=jpvt_sz*sizeof(int) + (a_sz + tau_sz + r_sz + worksz + Y_sz)*sizeof(LM_REAL); tot_sz=(a_sz + tau_sz + r_sz + worksz + Y_sz)*sizeof(LM_REAL) + jpvt_sz*sizeof(int); /* should be arranged in that order for proper doubles alignment */
buf=(LM_REAL *)malloc(tot_sz); /* allocate a "big" memory chunk at once */ buf=(LM_REAL *)malloc(tot_sz); /* allocate a "big" memory chunk at once */
if(!buf){ if(!buf){
fprintf(stderr, RCAT("Memory allocation request failed in ", LMLEC_ELIM) "()\n"); fprintf(stderr, RCAT("Memory allocation request failed in ", LMLEC_ELIM) "()\n");
exit(1); return LM_ERROR;
} }
a=(LM_REAL *)buf; a=buf;
jpvt=(int *)(a+a_sz); tau=a+a_sz;
tau=(LM_REAL *)(jpvt + jpvt_sz);
r=tau+tau_sz; r=tau+tau_sz;
work=r+r_sz; work=r+r_sz;
if(!Y) Y=work+worksz; if(!Y){
Y=work+worksz;
jpvt=(int *)(Y+Y_sz);
}
else
jpvt=(int *)(work+worksz);
/* copy input array so that LAPACK won't destroy it. Note that copying is /* copy input array so that LAPACK won't destroy it. Note that copying is
* done in row-major order, which equals A^T in column-major * done in row-major order, which equals A^T in column-major
...@@ -139,13 +143,12 @@ register int i, j, k; ...@@ -139,13 +143,12 @@ register int i, j, k;
if(info!=0){ if(info!=0){
if(info<0){ if(info<0){
fprintf(stderr, RCAT(RCAT("LAPACK error: illegal value for argument %d of ", GEQP3) " in ", LMLEC_ELIM) "()\n", -info); fprintf(stderr, RCAT(RCAT("LAPACK error: illegal value for argument %d of ", GEQP3) " in ", LMLEC_ELIM) "()\n", -info);
exit(1);
} }
else if(info>0){ else if(info>0){
fprintf(stderr, RCAT(RCAT("unknown LAPACK error (%d) for ", GEQP3) " in ", LMLEC_ELIM) "()\n", info); fprintf(stderr, RCAT(RCAT("unknown LAPACK error (%d) for ", GEQP3) " in ", LMLEC_ELIM) "()\n", info);
free(buf);
return 0;
} }
free(buf);
return LM_ERROR;
} }
/* the upper triangular part of a now contains the upper triangle of the unpermuted R */ /* the upper triangular part of a now contains the upper triangle of the unpermuted R */
...@@ -168,7 +171,6 @@ register int i, j, k; ...@@ -168,7 +171,6 @@ register int i, j, k;
if(rank<tn){ if(rank<tn){
fprintf(stderr, RCAT("\nConstraints matrix in ", LMLEC_ELIM) "() is not of full row rank (i.e. %d < %d)!\n" fprintf(stderr, RCAT("\nConstraints matrix in ", LMLEC_ELIM) "() is not of full row rank (i.e. %d < %d)!\n"
"Make sure that you do not specify redundant or inconsistent constraints.\n\n", rank, tn); "Make sure that you do not specify redundant or inconsistent constraints.\n\n", rank, tn);
//exit(1);
free(buf); free(buf);
return LM_ERROR; return LM_ERROR;
} }
...@@ -188,13 +190,12 @@ register int i, j, k; ...@@ -188,13 +190,12 @@ register int i, j, k;
if(info!=0){ if(info!=0){
if(info<0){ if(info<0){
fprintf(stderr, RCAT(RCAT("LAPACK error: illegal value for argument %d of ", TRTRI) " in ", LMLEC_ELIM) "()\n", -info); fprintf(stderr, RCAT(RCAT("LAPACK error: illegal value for argument %d of ", TRTRI) " in ", LMLEC_ELIM) "()\n", -info);
exit(1);
} }
else if(info>0){ else if(info>0){
fprintf(stderr, RCAT(RCAT("A(%d, %d) is exactly zero for ", TRTRI) " (singular matrix) in ", LMLEC_ELIM) "()\n", info, info); fprintf(stderr, RCAT(RCAT("A(%d, %d) is exactly zero for ", TRTRI) " (singular matrix) in ", LMLEC_ELIM) "()\n", info, info);
free(buf);
return 0;
} }
free(buf);
return LM_ERROR;
} }
/* then, transpose r in place */ /* then, transpose r in place */
for(i=0; i<rank; ++i) for(i=0; i<rank; ++i)
...@@ -223,13 +224,12 @@ register int i, j, k; ...@@ -223,13 +224,12 @@ register int i, j, k;
if(info!=0){ if(info!=0){
if(info<0){ if(info<0){
fprintf(stderr, RCAT(RCAT("LAPACK error: illegal value for argument %d of ", ORGQR) " in ", LMLEC_ELIM) "()\n", -info); fprintf(stderr, RCAT(RCAT("LAPACK error: illegal value for argument %d of ", ORGQR) " in ", LMLEC_ELIM) "()\n", -info);
exit(1);
} }
else if(info>0){ else if(info>0){
fprintf(stderr, RCAT(RCAT("unknown LAPACK error (%d) for ", ORGQR) " in ", LMLEC_ELIM) "()\n", info); fprintf(stderr, RCAT(RCAT("unknown LAPACK error (%d) for ", ORGQR) " in ", LMLEC_ELIM) "()\n", info);
free(buf);
return 0;
} }
free(buf);
return LM_ERROR;
} }
/* compute Y=Q_1*R^-T*P^T. Y is tm x rank */ /* compute Y=Q_1*R^-T*P^T. Y is tm x rank */
...@@ -399,6 +399,7 @@ int LEVMAR_LEC_DER( ...@@ -399,6 +399,7 @@ int LEVMAR_LEC_DER(
* 7 - stopped by invalid (i.e. NaN or Inf) "func" values. This is a user error * 7 - stopped by invalid (i.e. NaN or Inf) "func" values. This is a user error
* info[7]= # function evaluations * info[7]= # function evaluations
* info[8]= # Jacobian evaluations * info[8]= # Jacobian evaluations
* info[9]= # linear systems solved, i.e. # attempts for reducing error
*/ */
LM_REAL *work, /* working memory at least LM_LEC_DER_WORKSZ() reals large, allocated if NULL */ LM_REAL *work, /* working memory at least LM_LEC_DER_WORKSZ() reals large, allocated if NULL */
LM_REAL *covar, /* O: Covariance matrix corresponding to LS solution; mxm. Set to NULL if not needed. */ LM_REAL *covar, /* O: Covariance matrix corresponding to LS solution; mxm. Set to NULL if not needed. */
...@@ -422,14 +423,14 @@ int LEVMAR_LEC_DER( ...@@ -422,14 +423,14 @@ int LEVMAR_LEC_DER(
mm=m-k; mm=m-k;
if(n<mm){ if(n<mm){
fprintf(stderr, LCAT(LEVMAR_LEC_DER, "(): cannot solve a problem with fewer measurements + constraints [%d + %d] than unknowns [%d]\n"), n, k, m); fprintf(stderr, LCAT(LEVMAR_LEC_DER, "(): cannot solve a problem with fewer measurements + equality constraints [%d + %d] than unknowns [%d]\n"), n, k, m);
return LM_ERROR; return LM_ERROR;
} }
ptr=(LM_REAL *)malloc((2*m + m*mm + n*m + mm)*sizeof(LM_REAL)); ptr=(LM_REAL *)malloc((2*m + m*mm + n*m + mm)*sizeof(LM_REAL));
if(!ptr){ if(!ptr){
fprintf(stderr, LCAT(LEVMAR_LEC_DER, "(): memory allocation request failed\n")); fprintf(stderr, LCAT(LEVMAR_LEC_DER, "(): memory allocation request failed\n"));
exit(1); return LM_ERROR;
} }
data.p=p; data.p=p;
p0=ptr; p0=ptr;
...@@ -530,6 +531,7 @@ int LEVMAR_LEC_DIF( ...@@ -530,6 +531,7 @@ int LEVMAR_LEC_DIF(
* 7 - stopped by invalid (i.e. NaN or Inf) "func" values. This is a user error * 7 - stopped by invalid (i.e. NaN or Inf) "func" values. This is a user error
* info[7]= # function evaluations * info[7]= # function evaluations
* info[8]= # Jacobian evaluations * info[8]= # Jacobian evaluations
* info[9]= # linear systems solved, i.e. # attempts for reducing error
*/ */
LM_REAL *work, /* working memory at least LM_LEC_DIF_WORKSZ() reals large, allocated if NULL */ LM_REAL *work, /* working memory at least LM_LEC_DIF_WORKSZ() reals large, allocated if NULL */
LM_REAL *covar, /* O: Covariance matrix corresponding to LS solution; mxm. Set to NULL if not needed. */ LM_REAL *covar, /* O: Covariance matrix corresponding to LS solution; mxm. Set to NULL if not needed. */
...@@ -547,14 +549,14 @@ int LEVMAR_LEC_DIF( ...@@ -547,14 +549,14 @@ int LEVMAR_LEC_DIF(
mm=m-k; mm=m-k;
if(n<mm){ if(n<mm){
fprintf(stderr, LCAT(LEVMAR_LEC_DIF, "(): cannot solve a problem with fewer measurements + constraints [%d + %d] than unknowns [%d]\n"), n, k, m); fprintf(stderr, LCAT(LEVMAR_LEC_DIF, "(): cannot solve a problem with fewer measurements + equality constraints [%d + %d] than unknowns [%d]\n"), n, k, m);
return LM_ERROR; return LM_ERROR;
} }
ptr=(LM_REAL *)malloc((2*m + m*mm + mm)*sizeof(LM_REAL)); ptr=(LM_REAL *)malloc((2*m + m*mm + mm)*sizeof(LM_REAL));
if(!ptr){ if(!ptr){
fprintf(stderr, LCAT(LEVMAR_LEC_DIF, "(): memory allocation request failed\n")); fprintf(stderr, LCAT(LEVMAR_LEC_DIF, "(): memory allocation request failed\n"));
exit(1); return LM_ERROR;
} }
data.p=p; data.p=p;
p0=ptr; p0=ptr;
...@@ -618,7 +620,8 @@ int LEVMAR_LEC_DIF( ...@@ -618,7 +620,8 @@ int LEVMAR_LEC_DIF(
hx=(LM_REAL *)malloc((2*n+n*m)*sizeof(LM_REAL)); hx=(LM_REAL *)malloc((2*n+n*m)*sizeof(LM_REAL));
if(!hx){ if(!hx){
fprintf(stderr, LCAT(LEVMAR_LEC_DIF, "(): memory allocation request failed\n")); fprintf(stderr, LCAT(LEVMAR_LEC_DIF, "(): memory allocation request failed\n"));
exit(1); free(ptr);
return LM_ERROR;
} }
wrk=hx+n; wrk=hx+n;
......
...@@ -20,7 +20,13 @@ ...@@ -20,7 +20,13 @@
#ifndef _MISC_H_ #ifndef _MISC_H_
#define _MISC_H_ #define _MISC_H_
/* common prefix for BLAS subroutines. Leave undefined in case of no prefix. You might also need to modify LM_BLAS_PREFIX below */ /* common suffix for LAPACK subroutines. Define empty in case of no prefix. */
#define LM_LAPACK_SUFFIX _
//#define LM_LAPACK_SUFFIX // define empty
/* common prefix for BLAS subroutines. Leave undefined in case of no prefix.
* You might also need to modify LM_BLAS_PREFIX below
*/
/* f2c'd BLAS */ /* f2c'd BLAS */
//#define LM_BLAS_PREFIX f2c_ //#define LM_BLAS_PREFIX f2c_
/* C BLAS */ /* C BLAS */
...@@ -36,6 +42,9 @@ ...@@ -36,6 +42,9 @@
#define RCAT_(a, b) a #b #define RCAT_(a, b) a #b
#define RCAT(a, b) RCAT_(a, b) // force substitution #define RCAT(a, b) RCAT_(a, b) // force substitution
#define LM_MK_LAPACK_NAME(s) LM_ADD_PREFIX(LM_CAT_(s, LM_LAPACK_SUFFIX))
#define __BLOCKSZ__ 32 /* block size for cache-friendly matrix-matrix multiply. It should be #define __BLOCKSZ__ 32 /* block size for cache-friendly matrix-matrix multiply. It should be
* such that __BLOCKSZ__^2*sizeof(LM_REAL) is smaller than the CPU (L1) * such that __BLOCKSZ__^2*sizeof(LM_REAL) is smaller than the CPU (L1)
* data cache size. Notice that a value of 32 when LM_REAL=double assumes * data cache size. Notice that a value of 32 when LM_REAL=double assumes
......
...@@ -30,6 +30,7 @@ ...@@ -30,6 +30,7 @@
#define LEVMAR_COVAR LM_ADD_PREFIX(levmar_covar) #define LEVMAR_COVAR LM_ADD_PREFIX(levmar_covar)
#define LEVMAR_STDDEV LM_ADD_PREFIX(levmar_stddev) #define LEVMAR_STDDEV LM_ADD_PREFIX(levmar_stddev)
#define LEVMAR_CORCOEF LM_ADD_PREFIX(levmar_corcoef) #define LEVMAR_CORCOEF LM_ADD_PREFIX(levmar_corcoef)
#define LEVMAR_R2 LM_ADD_PREFIX(levmar_R2)
#define LEVMAR_BOX_CHECK LM_ADD_PREFIX(levmar_box_check) #define LEVMAR_BOX_CHECK LM_ADD_PREFIX(levmar_box_check)
#define LEVMAR_L2NRMXMY LM_ADD_PREFIX(levmar_L2nrmxmy) #define LEVMAR_L2NRMXMY LM_ADD_PREFIX(levmar_L2nrmxmy)
...@@ -47,8 +48,8 @@ static int LEVMAR_PSEUDOINVERSE(LM_REAL *A, LM_REAL *B, int m); ...@@ -47,8 +48,8 @@ static int LEVMAR_PSEUDOINVERSE(LM_REAL *A, LM_REAL *B, int m);
extern void GEMM(char *transa, char *transb, int *m, int *n, int *k, extern void GEMM(char *transa, char *transb, int *m, int *n, int *k,
LM_REAL *alpha, LM_REAL *a, int *lda, LM_REAL *b, int *ldb, LM_REAL *beta, LM_REAL *c, int *ldc); LM_REAL *alpha, LM_REAL *a, int *lda, LM_REAL *b, int *ldb, LM_REAL *beta, LM_REAL *c, int *ldc);
#define GESVD LM_ADD_PREFIX(gesvd_) #define GESVD LM_MK_LAPACK_NAME(gesvd)
#define GESDD LM_ADD_PREFIX(gesdd_) #define GESDD LM_MK_LAPACK_NAME(gesdd)
extern int GESVD(char *jobu, char *jobvt, int *m, int *n, LM_REAL *a, int *lda, LM_REAL *s, LM_REAL *u, int *ldu, extern int GESVD(char *jobu, char *jobvt, int *m, int *n, LM_REAL *a, int *lda, LM_REAL *s, LM_REAL *u, int *ldu,
LM_REAL *vt, int *ldvt, LM_REAL *work, int *lwork, int *info); LM_REAL *vt, int *ldvt, LM_REAL *work, int *lwork, int *info);
...@@ -56,8 +57,8 @@ extern int GESVD(char *jobu, char *jobvt, int *m, int *n, LM_REAL *a, int *lda, ...@@ -56,8 +57,8 @@ extern int GESVD(char *jobu, char *jobvt, int *m, int *n, LM_REAL *a, int *lda,
extern int GESDD(char *jobz, int *m, int *n, LM_REAL *a, int *lda, LM_REAL *s, LM_REAL *u, int *ldu, LM_REAL *vt, int *ldvt, extern int GESDD(char *jobz, int *m, int *n, LM_REAL *a, int *lda, LM_REAL *s, LM_REAL *u, int *ldu, LM_REAL *vt, int *ldvt,
LM_REAL *work, int *lwork, int *iwork, int *info); LM_REAL *work, int *lwork, int *iwork, int *info);
/* cholesky decomposition */ /* Cholesky decomposition */
#define POTF2 LM_ADD_PREFIX(potf2_) #define POTF2 LM_MK_LAPACK_NAME(potf2)
extern int POTF2(char *uplo, int *n, LM_REAL *a, int *lda, int *info); extern int POTF2(char *uplo, int *n, LM_REAL *a, int *lda, int *info);
#define LEVMAR_CHOLESKY LM_ADD_PREFIX(levmar_chol) #define LEVMAR_CHOLESKY LM_ADD_PREFIX(levmar_chol)
...@@ -70,7 +71,7 @@ static int LEVMAR_LUINVERSE(LM_REAL *A, LM_REAL *B, int m); ...@@ -70,7 +71,7 @@ static int LEVMAR_LUINVERSE(LM_REAL *A, LM_REAL *B, int m);
/* blocked multiplication of the transpose of the nxm matrix a with itself (i.e. a^T a) /* blocked multiplication of the transpose of the nxm matrix a with itself (i.e. a^T a)
* using a block size of bsize. The product is returned in b. * using a block size of bsize. The product is returned in b.
* Since a^T a is symmetric, its computation can be speeded up by computing only its * Since a^T a is symmetric, its computation can be sped up by computing only its
* upper triangular part and copying it to the lower part. * upper triangular part and copying it to the lower part.
* *
* More details on blocking can be found at * More details on blocking can be found at
...@@ -323,7 +324,7 @@ int fvec_sz=n, fjac_sz=n*m, pp_sz=m, fvecp_sz=n; ...@@ -323,7 +324,7 @@ int fvec_sz=n, fjac_sz=n*m, pp_sz=m, fvecp_sz=n;
* into B using SVD. A and B can coincide * into B using SVD. A and B can coincide
* *
* The function returns 0 in case of error (e.g. A is singular), * The function returns 0 in case of error (e.g. A is singular),
* the rank of A if successfull * the rank of A if successful
* *
* A, B are mxm * A, B are mxm
* *
...@@ -341,32 +342,33 @@ LM_REAL thresh, one_over_denom; ...@@ -341,32 +342,33 @@ LM_REAL thresh, one_over_denom;
int info, rank, worksz, *iwork, iworksz; int info, rank, worksz, *iwork, iworksz;
/* calculate required memory size */ /* calculate required memory size */
worksz=16*m; /* more than needed */ worksz=5*m; // min worksize for GESVD
//worksz=m*(7*m+4); // min worksize for GESDD
iworksz=8*m; iworksz=8*m;
a_sz=m*m; a_sz=m*m;
u_sz=m*m; s_sz=m; vt_sz=m*m; u_sz=m*m; s_sz=m; vt_sz=m*m;
tot_sz=iworksz*sizeof(int) + (a_sz + u_sz + s_sz + vt_sz + worksz)*sizeof(LM_REAL); tot_sz=(a_sz + u_sz + s_sz + vt_sz + worksz)*sizeof(LM_REAL) + iworksz*sizeof(int); /* should be arranged in that order for proper doubles alignment */
buf_sz=tot_sz; buf_sz=tot_sz;
buf=(LM_REAL *)malloc(buf_sz); buf=(LM_REAL *)malloc(buf_sz);
if(!buf){ if(!buf){
fprintf(stderr, RCAT("memory allocation in ", LEVMAR_PSEUDOINVERSE) "() failed!\n"); fprintf(stderr, RCAT("memory allocation in ", LEVMAR_PSEUDOINVERSE) "() failed!\n");
exit(1); return 0; /* error */
} }
iwork=(int *)buf; a=buf;
a=(LM_REAL *)(iwork+iworksz); u=a+a_sz;
s=u+u_sz;
vt=s+s_sz;
work=vt+vt_sz;
iwork=(int *)(work+worksz);
/* store A (column major!) into a */ /* store A (column major!) into a */
for(i=0; i<m; i++) for(i=0; i<m; i++)
for(j=0; j<m; j++) for(j=0; j<m; j++)
a[i+j*m]=A[i*m+j]; a[i+j*m]=A[i*m+j];
u=a + a_sz;
s=u+u_sz;
vt=s+s_sz;
work=vt+vt_sz;
/* SVD decomposition of A */ /* SVD decomposition of A */
GESVD("A", "A", (int *)&m, (int *)&m, a, (int *)&m, s, u, (int *)&m, vt, (int *)&m, work, (int *)&worksz, &info); GESVD("A", "A", (int *)&m, (int *)&m, a, (int *)&m, s, u, (int *)&m, vt, (int *)&m, work, (int *)&worksz, &info);
//GESDD("A", (int *)&m, (int *)&m, a, (int *)&m, s, u, (int *)&m, vt, (int *)&m, work, (int *)&worksz, iwork, &info); //GESDD("A", (int *)&m, (int *)&m, a, (int *)&m, s, u, (int *)&m, vt, (int *)&m, work, (int *)&worksz, iwork, &info);
...@@ -375,14 +377,12 @@ int info, rank, worksz, *iwork, iworksz; ...@@ -375,14 +377,12 @@ int info, rank, worksz, *iwork, iworksz;
if(info!=0){ if(info!=0){
if(info<0){ if(info<0){
fprintf(stderr, RCAT(RCAT(RCAT("LAPACK error: illegal value for argument %d of ", GESVD), "/" GESDD) " in ", LEVMAR_PSEUDOINVERSE) "()\n", -info); fprintf(stderr, RCAT(RCAT(RCAT("LAPACK error: illegal value for argument %d of ", GESVD), "/" GESDD) " in ", LEVMAR_PSEUDOINVERSE) "()\n", -info);
exit(1);
} }
else{ else{
fprintf(stderr, RCAT("LAPACK error: dgesdd (dbdsdc)/dgesvd (dbdsqr) failed to converge in ", LEVMAR_PSEUDOINVERSE) "() [info=%d]\n", info); fprintf(stderr, RCAT("LAPACK error: dgesdd (dbdsdc)/dgesvd (dbdsqr) failed to converge in ", LEVMAR_PSEUDOINVERSE) "() [info=%d]\n", info);
free(buf);
return 0;
} }
free(buf);
return 0;
} }
if(eps<0.0){ if(eps<0.0){
...@@ -421,8 +421,7 @@ static int SVDINV(LM_REAL *a, LM_REAL *b, int m); ...@@ -421,8 +421,7 @@ static int SVDINV(LM_REAL *a, LM_REAL *b, int m);
* *
* A and B are mxm * A and B are mxm
* *
* The function returns 0 in case of error, * The function returns 0 in case of error, 1 if successful
* 1 if successfull
* *
*/ */
static int LEVMAR_LUINVERSE(LM_REAL *A, LM_REAL *B, int m) static int LEVMAR_LUINVERSE(LM_REAL *A, LM_REAL *B, int m)
...@@ -439,19 +438,19 @@ LM_REAL *a, *x, *work, max, sum, tmp; ...@@ -439,19 +438,19 @@ LM_REAL *a, *x, *work, max, sum, tmp;
a_sz=m*m; a_sz=m*m;
x_sz=m; x_sz=m;
work_sz=m; work_sz=m;
tot_sz=idx_sz*sizeof(int) + (a_sz+x_sz+work_sz)*sizeof(LM_REAL); tot_sz=(a_sz + x_sz + work_sz)*sizeof(LM_REAL) + idx_sz*sizeof(int); /* should be arranged in that order for proper doubles alignment */
buf_sz=tot_sz; buf_sz=tot_sz;
buf=(void *)malloc(tot_sz); buf=(void *)malloc(tot_sz);
if(!buf){ if(!buf){
fprintf(stderr, RCAT("memory allocation in ", LEVMAR_LUINVERSE) "() failed!\n"); fprintf(stderr, RCAT("memory allocation in ", LEVMAR_LUINVERSE) "() failed!\n");
exit(1); return 0; /* error */
} }
idx=(int *)buf; a=buf;
a=(LM_REAL *)(idx + idx_sz); x=a+a_sz;
x=a + a_sz; work=x+x_sz;
work=x + x_sz; idx=(int *)(work+work_sz);
/* avoid destroying A by copying it to a */ /* avoid destroying A by copying it to a */
for(i=0; i<a_sz; ++i) a[i]=A[i]; for(i=0; i<a_sz; ++i) a[i]=A[i];
...@@ -623,6 +622,44 @@ LM_REAL LEVMAR_CORCOEF(LM_REAL *covar, int m, int i, int j) ...@@ -623,6 +622,44 @@ LM_REAL LEVMAR_CORCOEF(LM_REAL *covar, int m, int i, int j)
return (LM_REAL)(covar[i*m+j]/sqrt(covar[i*m+i]*covar[j*m+j])); return (LM_REAL)(covar[i*m+j]/sqrt(covar[i*m+i]*covar[j*m+j]));
} }
/* coefficient of determination.
* see http://en.wikipedia.org/wiki/Coefficient_of_determination
*/
LM_REAL LEVMAR_R2(void (*func)(LM_REAL *p, LM_REAL *hx, int m, int n, void *adata),
LM_REAL *p, LM_REAL *x, int m, int n, void *adata)
{
register int i;
register LM_REAL tmp;
LM_REAL SSerr, // sum of squared errors, i.e. residual sum of squares \sum_i (x_i-hx_i)^2
SStot, // \sum_i (x_i-xavg)^2
*hx, xavg;
if((hx=(LM_REAL *)malloc(n*sizeof(LM_REAL)))==NULL){
fprintf(stderr, RCAT("memory allocation request failed in ", LEVMAR_R2) "()\n");
exit(1);
}
/* hx=f(p) */
(*func)(p, hx, m, n, adata);
for(i=0, tmp=0.0; i<n; ++i)
tmp+=x[i];
xavg=tmp/(LM_REAL)n;
for(i=0, SSerr=SStot=0.0; i<n; ++i){
tmp=x[i]-hx[i];
SSerr+=tmp*tmp;
tmp=x[i]-xavg;
SStot+=tmp*tmp;
}
free(hx);
return LM_CNST(1.0) - SSerr/SStot;
}
/* check box constraints for consistency */ /* check box constraints for consistency */
int LEVMAR_BOX_CHECK(LM_REAL *lb, LM_REAL *ub, int m) int LEVMAR_BOX_CHECK(LM_REAL *lb, LM_REAL *ub, int m)
{ {
...@@ -638,30 +675,36 @@ register int i; ...@@ -638,30 +675,36 @@ register int i;
#ifdef HAVE_LAPACK #ifdef HAVE_LAPACK
/* compute the Cholesky decompostion of C in W, s.t. C=W^t W and W is upper triangular */ /* compute the Cholesky decomposition of C in W, s.t. C=W^t W and W is upper triangular */
int LEVMAR_CHOLESKY(LM_REAL *C, LM_REAL *W, int m) int LEVMAR_CHOLESKY(LM_REAL *C, LM_REAL *W, int m)
{ {
register int i, j; register int i, j;
int info; int info;
/* copy weights array C to W (in column-major order!) so that LAPACK won't destroy it */ /* compute the Cholesky decomposition of C in W, s.t. C=W^t W and W is upper triangular */
for(i=0; i<m; i++) int LEVMAR_CHOLESKY(LM_REAL *C, LM_REAL *W, int m)
for(j=0; j<m; j++) {
W[i+j*m]=C[i*m+j]; register int i, j;
int info;
/* cholesky decomposition */ /* copy weights array C to W so that LAPACK won't destroy it;
* C is assumed symmetric, hence no transposition is needed
*/
for(i=0, j=m*m; i<j; ++i)
W[i]=C[i];
/* Cholesky decomposition */
POTF2("U", (int *)&m, W, (int *)&m, (int *)&info); POTF2("U", (int *)&m, W, (int *)&m, (int *)&info);
/* error treatment */ /* error treatment */
if(info!=0){ if(info!=0){
if(info<0){ if(info<0){
fprintf(stderr, "LAPACK error: illegal value for argument %d of dpotf2 in %s\n", -info, LCAT(LEVMAR_DER, "()")); fprintf(stderr, "LAPACK error: illegal value for argument %d of dpotf2 in %s\n", -info, LCAT(LEVMAR_CHOLESKY, "()"));
exit(1); }
} else{
else{ fprintf(stderr, "LAPACK error: the leading minor of order %d is not positive definite,\n%s()\n", info,
fprintf(stderr, "LAPACK error: the leading minor of order %d is not positive definite,\n%s()\n", info, RCAT("and the Cholesky factorization could not be completed in ", LEVMAR_CHOLESKY));
RCAT("and the cholesky factorization could not be completed in ", LEVMAR_CHOLESKY)); }
return LM_ERROR; return LM_ERROR;
}
} }
/* the decomposition is in the upper part of W (in column-major order!). /* the decomposition is in the upper part of W (in column-major order!).
...@@ -699,17 +742,17 @@ register LM_REAL sum0=0.0, sum1=0.0, sum2=0.0, sum3=0.0; ...@@ -699,17 +742,17 @@ register LM_REAL sum0=0.0, sum1=0.0, sum2=0.0, sum3=0.0;
*/ */
blockn = (n>>bpwr)<<bpwr; /* (n / blocksize) * blocksize; */ blockn = (n>>bpwr)<<bpwr; /* (n / blocksize) * blocksize; */
/* unroll the loop in blocks of `blocksize'; looping downwards gains some more speed */
if(x){ if(x){
/* unroll the loop in blocks of `blocksize' */ for(i=blockn-1; i>0; i-=blocksize){
for(i=0; i<blockn; i+=blocksize){
e[i ]=x[i ]-y[i ]; sum0+=e[i ]*e[i ]; e[i ]=x[i ]-y[i ]; sum0+=e[i ]*e[i ];
j1=i+1; e[j1]=x[j1]-y[j1]; sum1+=e[j1]*e[j1]; j1=i-1; e[j1]=x[j1]-y[j1]; sum1+=e[j1]*e[j1];
j2=i+2; e[j2]=x[j2]-y[j2]; sum2+=e[j2]*e[j2]; j2=i-2; e[j2]=x[j2]-y[j2]; sum2+=e[j2]*e[j2];
j3=i+3; e[j3]=x[j3]-y[j3]; sum3+=e[j3]*e[j3]; j3=i-3; e[j3]=x[j3]-y[j3]; sum3+=e[j3]*e[j3];
j4=i+4; e[j4]=x[j4]-y[j4]; sum0+=e[j4]*e[j4]; j4=i-4; e[j4]=x[j4]-y[j4]; sum0+=e[j4]*e[j4];
j5=i+5; e[j5]=x[j5]-y[j5]; sum1+=e[j5]*e[j5]; j5=i-5; e[j5]=x[j5]-y[j5]; sum1+=e[j5]*e[j5];
j6=i+6; e[j6]=x[j6]-y[j6]; sum2+=e[j6]*e[j6]; j6=i-6; e[j6]=x[j6]-y[j6]; sum2+=e[j6]*e[j6];
j7=i+7; e[j7]=x[j7]-y[j7]; sum3+=e[j7]*e[j7]; j7=i-7; e[j7]=x[j7]-y[j7]; sum3+=e[j7]*e[j7];
} }
/* /*
...@@ -718,6 +761,7 @@ register LM_REAL sum0=0.0, sum1=0.0, sum2=0.0, sum3=0.0; ...@@ -718,6 +761,7 @@ register LM_REAL sum0=0.0, sum1=0.0, sum2=0.0, sum3=0.0;
* but a switch is faster (and more interesting) * but a switch is faster (and more interesting)
*/ */
i=blockn;
if(i<n){ if(i<n){
/* Jump into the case at the place that will allow /* Jump into the case at the place that will allow
* us to finish off the appropriate number of items. * us to finish off the appropriate number of items.
...@@ -735,16 +779,15 @@ register LM_REAL sum0=0.0, sum1=0.0, sum2=0.0, sum3=0.0; ...@@ -735,16 +779,15 @@ register LM_REAL sum0=0.0, sum1=0.0, sum2=0.0, sum3=0.0;
} }
} }
else{ /* x==0 */ else{ /* x==0 */
/* unroll the loop in blocks of `blocksize' */ for(i=blockn-1; i>0; i-=blocksize){
for(i=0; i<blockn; i+=blocksize){
e[i ]=-y[i ]; sum0+=e[i ]*e[i ]; e[i ]=-y[i ]; sum0+=e[i ]*e[i ];
j1=i+1; e[j1]=-y[j1]; sum1+=e[j1]*e[j1]; j1=i-1; e[j1]=-y[j1]; sum1+=e[j1]*e[j1];
j2=i+2; e[j2]=-y[j2]; sum2+=e[j2]*e[j2]; j2=i-2; e[j2]=-y[j2]; sum2+=e[j2]*e[j2];
j3=i+3; e[j3]=-y[j3]; sum3+=e[j3]*e[j3]; j3=i-3; e[j3]=-y[j3]; sum3+=e[j3]*e[j3];
j4=i+4; e[j4]=-y[j4]; sum0+=e[j4]*e[j4]; j4=i-4; e[j4]=-y[j4]; sum0+=e[j4]*e[j4];
j5=i+5; e[j5]=-y[j5]; sum1+=e[j5]*e[j5]; j5=i-5; e[j5]=-y[j5]; sum1+=e[j5]*e[j5];
j6=i+6; e[j6]=-y[j6]; sum2+=e[j6]*e[j6]; j6=i-6; e[j6]=-y[j6]; sum2+=e[j6]*e[j6];
j7=i+7; e[j7]=-y[j7]; sum3+=e[j7]*e[j7]; j7=i-7; e[j7]=-y[j7]; sum3+=e[j7]*e[j7];
} }
/* /*
...@@ -753,6 +796,7 @@ register LM_REAL sum0=0.0, sum1=0.0, sum2=0.0, sum3=0.0; ...@@ -753,6 +796,7 @@ register LM_REAL sum0=0.0, sum1=0.0, sum2=0.0, sum3=0.0;
* but a switch is faster (and more interesting) * but a switch is faster (and more interesting)
*/ */
i=blockn;
if(i<n){ if(i<n){
/* Jump into the case at the place that will allow /* Jump into the case at the place that will allow
* us to finish off the appropriate number of items. * us to finish off the appropriate number of items.
...@@ -794,7 +838,7 @@ static int SVDINV(LM_REAL *a, LM_REAL *b, int m) ...@@ -794,7 +838,7 @@ static int SVDINV(LM_REAL *a, LM_REAL *b, int m)
(ct=bt/at,at*sqrt(1.0+ct*ct)) \ (ct=bt/at,at*sqrt(1.0+ct*ct)) \
: (bt ? (ct=at/bt,bt*sqrt(1.0+ct*ct)): 0.0)) : (bt ? (ct=at/bt,bt*sqrt(1.0+ct*ct)): 0.0))
#define SIGN(a,b) ((b) >= 0.0 ? fabs(a) : -fabs(a)) #define SIGN(a,b) ((b) >= 0.0 ? fabs(a) : -fabs(a))
#define TOL 1.0e-11 #define TOL 1.0e-6
int flag,i,its,j,jj,k,l,mmi,nm, nml, rank; int flag,i,its,j,jj,k,l,mmi,nm, nml, rank;
LM_REAL *vmat,*wmat, LM_REAL *vmat,*wmat,
...@@ -804,7 +848,6 @@ static int SVDINV(LM_REAL *a, LM_REAL *b, int m) ...@@ -804,7 +848,6 @@ static int SVDINV(LM_REAL *a, LM_REAL *b, int m)
anorm, g, scale, anorm, g, scale,
at,bt,ct,maxarg1,maxarg2, at,bt,ct,maxarg1,maxarg2,
thresh, wmax; thresh, wmax;
anorm = g = scale = 0.0; anorm = g = scale = 0.0;
rv1=(LM_REAL *)malloc(m*sizeof(LM_REAL)); rv1=(LM_REAL *)malloc(m*sizeof(LM_REAL));
...@@ -953,7 +996,7 @@ static int SVDINV(LM_REAL *a, LM_REAL *b, int m) ...@@ -953,7 +996,7 @@ static int SVDINV(LM_REAL *a, LM_REAL *b, int m)
for (l=k;l>=0;l--) for (l=k;l>=0;l--)
{ {
nm=l-1; nm=l-1;
if (fabs(rv1[l]) <= anorm*TOL) if (!l || fabs(rv1[l]) <= anorm*TOL)
{ {
flag=0; flag=0;
break; break;
...@@ -1103,6 +1146,7 @@ static int SVDINV(LM_REAL *a, LM_REAL *b, int m) ...@@ -1103,6 +1146,7 @@ static int SVDINV(LM_REAL *a, LM_REAL *b, int m)
#undef LEVMAR_COVAR #undef LEVMAR_COVAR
#undef LEVMAR_STDDEV #undef LEVMAR_STDDEV
#undef LEVMAR_CORCOEF #undef LEVMAR_CORCOEF
#undef LEVMAR_R2
#undef LEVMAR_CHKJAC #undef LEVMAR_CHKJAC
#undef LEVMAR_FDIF_FORW_JAC_APPROX #undef LEVMAR_FDIF_FORW_JAC_APPROX
#undef LEVMAR_FDIF_CENT_JAC_APPROX #undef LEVMAR_FDIF_CENT_JAC_APPROX
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment