Example 09: Least Squares

This example demonstrates solving overdetermined (\(m > n\)) and underdetermined (\(m < n\)) linear systems using least squares.

Key Concepts

Overdetermined Systems: Finding \(x\) that minimizes \(\|Ax - B\|_2\).
Underdetermined Systems: Finding the minimum norm solution \(x\) that satisfies \(Ax = B\).
Simplified API: Using slate::least_squares_solve which handles both cases automatically.
Traditional API: Using slate::gels.

C++ Example

Overdetermined Least Squares (Lines 27-46)

slate::Matrix<scalar_type> A( m, n, nb, ... );
slate::Matrix<scalar_type> BX( max_mn, nrhs, nb, ... );

// BX contains B on input
auto B = BX; // View of top m rows
auto X = BX.slice( 0, n-1, 0, nrhs-1 ); // View where X will be

slate::least_squares_solve( A, BX );

For overdetermined systems (\(m \ge n\)):

A is m by n.
The RHS matrix BX must be large enough to hold both the input B (m rows) and the result X (conceptually n rows, though in the algorithm B is overwritten in place). Since m >= n, m rows is sufficient.
least_squares_solve (gels) overwrites A with its QR factors and overwrites BX with the solution: on return, X occupies the first n rows of BX.

Underdetermined Least Squares (Lines 59-82)

// solve A^H X = B
auto AH = conj_transpose( A );
slate::least_squares_solve( AH, BX );

For underdetermined systems (\(m < n\)), we seek the minimum norm solution of \(A x = B\). SLATE’s gels routine expects an m by n matrix where m >= n. To solve the underdetermined case \(A x = B\) where A is fat (m < n), we therefore pose the problem in terms of its conjugate transpose \(A^H\), which is tall (n by m, with n > m).

The example demonstrates solving \(A^H X = B\) where A is tall (m > n), which effectively simulates an underdetermined system from the perspective of the transposed matrix.
BX must be size max(m, n) by nrhs. Since the solution X (m rows) has more rows than the input B (n rows), BX provides the necessary space.

// ex09_least_squares.cc
// Solve over- and under-determined AX = B

/// !!!   Lines between `//---------- begin label`          !!!
/// !!!             and `//---------- end label`            !!!
/// !!!   are included in the SLATE Users' Guide.           !!!

#include <slate/slate.hh>

#include "util.hh"

// Size of MPI_COMM_WORLD and this process's rank in it; both are set in main.
int mpi_size = 0;
int mpi_rank = 0;
// Dimensions of the p-by-q process grid; set in main by grid_size and
// passed to the slate::Matrix constructors in the test routines.
int grid_p = 0;
int grid_q = 0;

//------------------------------------------------------------------------------
template <typename scalar_type>
void test_gels_overdetermined()
{
    print_func( mpi_rank );

    int64_t m=2000, n=1000, nrhs=100, nb=256;

    //---------- begin over1
    int64_t max_mn = std::max( m, n );
    slate::Matrix<scalar_type> A( m, n, nb, grid_p, grid_q, MPI_COMM_WORLD );
    slate::Matrix<scalar_type> BX( max_mn, nrhs, nb, grid_p, grid_q, MPI_COMM_WORLD );
    // ...
    //---------- end over1

    A.insertLocalTiles();
    BX.insertLocalTiles();
    //---------- begin over2
    auto B = BX;  // == BX.slice( 0, m-1, 0, nrhs-1 );
    auto X = BX.slice( 0, n-1, 0, nrhs-1 );
    //---------- end over2
    random_matrix( A );
    random_matrix( B );

    //---------- begin over3

    // solve AX = B, solution in X
    slate::least_squares_solve( A, BX );  // simplified API

    slate::gels( A, BX );                 // traditional API
    //---------- end over3
}

//------------------------------------------------------------------------------
template <typename scalar_type>
void test_gels_underdetermined()
{
    print_func( mpi_rank );

    int64_t m=2000, n=1000, nrhs=100, nb=256;

    //---------- begin under1
    int64_t max_mn = std::max( m, n );
    slate::Matrix<scalar_type> A( m, n, nb, grid_p, grid_q, MPI_COMM_WORLD );
    slate::Matrix<scalar_type> BX( max_mn, nrhs, nb, grid_p, grid_q, MPI_COMM_WORLD );
    // ...
    //---------- end under1

    A.insertLocalTiles();
    BX.insertLocalTiles();

    //---------- begin under2
    auto B = BX.slice( 0, n-1, 0, nrhs-1 );
    auto X = BX;  // == BX.slice( 0, m-1, 0, nrhs-1 );
    //---------- end under2

    random_matrix( A );
    random_matrix( B );

    //---------- begin under3

    // solve A^H X = B, solution in X
    auto AH = conj_transpose( A );
    slate::least_squares_solve( AH, BX );  // simplified API

    slate::gels( AH, BX );                 // traditional API
    //---------- end under3
}

//------------------------------------------------------------------------------
/// Program entry point: initializes MPI, determines the process grid, and
/// runs the over- and under-determined least squares examples for each
/// precision selected on the command line.
///
/// @param[in] argc
///     Number of command line arguments.
///
/// @param[in] argv
///     Command line arguments, passed to parse_args and MPI_Init_thread.
///
/// @return 0 on success; 1 if MPI does not provide MPI_THREAD_MULTIPLE
///     or if an exception derived from std::exception is caught.
int main( int argc, char** argv )
{
    try {
        // Parse command line to set types for s, d, c, z precisions.
        bool types[ 4 ];
        parse_args( argc, argv, types );

        int provided = 0;
        slate_mpi_call(
            MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &provided ) );

        // Checked at run time rather than with assert(), which is compiled
        // out when NDEBUG is defined and would let the run continue with
        // insufficient thread support.
        if (provided != MPI_THREAD_MULTIPLE) {
            fprintf( stderr,
                     "MPI_THREAD_MULTIPLE is required, but MPI provided "
                     "thread level %d\n", provided );
            slate_mpi_call(
                MPI_Finalize() );
            return 1;
        }

        slate_mpi_call(
            MPI_Comm_size( MPI_COMM_WORLD, &mpi_size ) );

        slate_mpi_call(
            MPI_Comm_rank( MPI_COMM_WORLD, &mpi_rank ) );

        // Determine p-by-q grid for this MPI size.
        grid_size( mpi_size, &grid_p, &grid_q );
        if (mpi_rank == 0) {
            printf( "mpi_size %d, grid_p %d, grid_q %d\n",
                    mpi_size, grid_p, grid_q );
        }

        // so random_matrix is different on different ranks.
        srand( 100 * mpi_rank );

        // types[ 0..3 ] select float, double, complex<float>,
        // complex<double>, respectively. Rank 0 prints a blank line
        // between precisions whether or not the precision was run.
        if (types[ 0 ]) {
            test_gels_overdetermined < float >();
            test_gels_underdetermined< float >();
        }
        if (mpi_rank == 0)
            printf( "\n" );

        if (types[ 1 ]) {
            test_gels_overdetermined < double >();
            test_gels_underdetermined< double >();
        }
        if (mpi_rank == 0)
            printf( "\n" );

        if (types[ 2 ]) {
            test_gels_overdetermined < std::complex<float> >();
            test_gels_underdetermined< std::complex<float> >();
        }
        if (mpi_rank == 0)
            printf( "\n" );

        if (types[ 3 ]) {
            test_gels_overdetermined < std::complex<double> >();
            test_gels_underdetermined< std::complex<double> >();
        }

        slate_mpi_call(
            MPI_Finalize() );
    }
    catch (std::exception const& ex) {
        // Terminate the message with a newline so it is not run together
        // with whatever is printed next on the terminal.
        fprintf( stderr, "%s\n", ex.what() );
        return 1;
    }
    return 0;
}