Example 13: Non-uniform Block Sizes

This example demonstrates creating a matrix with non-uniform tile sizes.

Key Concepts

Lambda Constructors: Using lambda functions to define tile properties (tileNb, tileRank, tileDevice) instead of fixed values.
Custom Block Sizes: Defining a function that returns the block size for a given block index \(j\).
Process/Device Mapping: Using helper functions like slate::func::process_2d_grid and slate::func::device_1d_grid to define distribution.

C++ Example

Custom Block Size Function (Lines 30-34)

std::function< int64_t (int64_t j) >
tileNb = [n, nb_](int64_t j)
{
    return (j % 2 != 0 ? nb_/2 : nb_);
};

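Instead of passing a constant nb, we define a lambda function tileNb.

- Input: Block index j (0, 1, 2, …).
- Output: The size of block j. Because the same function is used below for both rows and columns, this is the number of rows in block row j as well as the number of columns in block column j.
- This example alternates block sizes between nb (even j) and nb/2 (odd j).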

Distribution Functions (Lines 36-38)

auto tileRank = slate::func::process_2d_grid( slate::GridOrder::Col, p_, q_ );
auto tileDevice = slate::func::device_1d_grid( slate::GridOrder::Col, p_, num_devices_ );

We can also customize how tiles map to MPI ranks and devices. Here we use standard helper functions from slate::func, but you could write your own lambdas (e.g., to force a specific block cyclic pattern or a custom distribution).

Matrix Construction (Line 40)

slate::Matrix<scalar_type> A( n, n, tileNb, tileNb, tileRank, tileDevice, MPI_COMM_WORLD );

The constructor takes these functions as arguments.

- tileNb is passed twice: once for the row heights (the tileMb argument) and once for the column widths (the tileNb argument). Rows and columns are therefore partitioned identically, so diagonal tiles are square, while off-diagonal tiles can be rectangular (e.g., nb × nb/2).
- tileRank determines the MPI rank for tile (i, j).
- tileDevice determines the GPU device ID for tile (i, j).

Verification (Lines 53-59)

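The code iterates over the block columns to verify that the block sizes match the logic defined in the lambda and that they sum to n. Note that A.tileNb(j) handles the boundary condition at the end of the matrix automatically: the last block is truncated to the columns that remain, so the expected size is min( tileNb(j), n - jj ), where jj is the starting column of block j.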

// ex13_non_uniform_block_size.cc
// create 1000 x 1000 matrix on 2 x 2 MPI process grid, with non-uniform tile size

/// !!!   Lines between `//---------- begin label`          !!!
/// !!!             and `//---------- end label`            !!!
/// !!!   are included in the SLATE Users' Guide.           !!!

#include <slate/slate.hh>

#include "util.hh"

// File-scope MPI state, filled in by main() before any test routine runs.
// Size of MPI_COMM_WORLD, set via MPI_Comm_size.
int mpi_size = 0;
// Rank of this process in MPI_COMM_WORLD, set via MPI_Comm_rank.
int mpi_rank = 0;
// Dimensions of the p-by-q process grid, set by grid_size() from mpi_size.
int grid_p = 0;
int grid_q = 0;

//------------------------------------------------------------------------------
/// Builds an n-by-n SLATE matrix whose tile sizes come from a lambda instead
/// of a single constant, fills the local tiles via random_matrix(), and
/// asserts that the tile widths reported by the matrix match the lambda.
///
/// Reads the file-scope mpi_rank, grid_p, and grid_q, so main() must have
/// set them first.
///
/// @tparam scalar_type  Element type of the matrix and its tiles.
template <typename scalar_type>
void test_matrix_lambda()
{
    print_func( mpi_rank );

    int64_t n=1000, nb=256;

    // Plain-int copies used by the lambda and the slate::func helpers.
    int nb_ = nb;
    int p_ = grid_p;
    int q_ = grid_q;
    int num_devices_ = 0;

    // Size of block j: nb_ for even j, nb_/2 for odd j.
    // Only nb_ is captured; n plays no part in computing a block size.
    std::function< int64_t (int64_t j) >
    tileNb = [nb_](int64_t j)
    {
        return (j % 2 != 0 ? nb_/2 : nb_);
    };

    // Functions mapping a tile index (i, j) to its MPI rank and its device.
    auto tileRank = slate::func::process_2d_grid( slate::GridOrder::Col, p_, q_ );
    auto tileDevice = slate::func::device_1d_grid( slate::GridOrder::Col,
                                                   p_, num_devices_ );

    // tileNb is passed twice: once for row heights, once for column widths.
    slate::Matrix<scalar_type> A( n, n, tileNb, tileNb, tileRank, tileDevice, MPI_COMM_WORLD );
    A.insertLocalTiles();

    // Fill every tile owned by this process.
    for (int64_t j = 0; j < A.nt(); ++j) {
        for (int64_t i = 0; i < A.mt(); ++i) {
            if (A.tileIsLocal( i, j )) {
                slate::Tile<scalar_type> T = A( i, j );
                random_matrix( T.mb(), T.nb(), T.data(), T.stride() );
            }
        }
    }

    // Verify tileNb(j) for every block column and that sum tileNb(j) == n.
    // The last block may be narrower than the lambda's value, hence the min
    // with the columns that remain. The index and running offset are int64_t,
    // like n and the indices in the fill loop above, so nothing is narrowed.
    int64_t nt = A.nt();
    int64_t jj = 0;
    for (int64_t j = 0; j < nt; ++j) {
        assert( A.tileNb(j) == blas::min( tileNb(j), n - jj ) );
        jj += A.tileNb( j );
    }
    assert( jj == n );
}

//------------------------------------------------------------------------------
/// Entry point: parses the precision flags, initializes MPI, determines the
/// process grid, runs test_matrix_lambda for each selected precision, and
/// finalizes MPI.
///
/// @return 0 on success; 1 if a std::exception was caught.
int main( int argc, char** argv )
{
    int status = 0;

    try {
        // types[ 0..3 ] select the s, d, c, z precisions, in that order.
        bool types[ 4 ];
        parse_args( argc, argv, types );

        // Start MPI, requesting full multi-threaded support.
        int provided = 0;
        slate_mpi_call(
            MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &provided ) );
        assert( provided == MPI_THREAD_MULTIPLE );

        slate_mpi_call( MPI_Comm_size( MPI_COMM_WORLD, &mpi_size ) );
        slate_mpi_call( MPI_Comm_rank( MPI_COMM_WORLD, &mpi_rank ) );

        // Choose the p-by-q grid for this MPI size; only rank 0 reports it.
        grid_size( mpi_size, &grid_p, &grid_q );
        if (mpi_rank == 0) {
            printf( "mpi_size %d, grid_p %d, grid_q %d\n",
                    mpi_size, grid_p, grid_q );
        }

        // Rank-dependent seed, so random_matrix differs between ranks.
        srand( 100 * mpi_rank );

        // Run the example once for each requested precision.
        if (types[ 0 ])
            test_matrix_lambda< float >();
        if (types[ 1 ])
            test_matrix_lambda< double >();
        if (types[ 2 ])
            test_matrix_lambda< std::complex<float> >();
        if (types[ 3 ])
            test_matrix_lambda< std::complex<double> >();

        slate_mpi_call( MPI_Finalize() );
    }
    catch (std::exception const& err) {
        // Report the failure and signal it through the exit status.
        fprintf( stderr, "%s", err.what() );
        status = 1;
    }

    return status;
}