KokkosBatched::Copy¶
Defined in header: KokkosBatched_Copy_Decl.hpp
template <typename ArgTrans = Trans::NoTranspose, int rank = 2>
struct SerialCopy {
template <typename AViewType, typename BViewType>
KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, const BViewType &B);
};
template <typename MemberType, typename ArgTrans = Trans::NoTranspose, int rank = 2>
struct TeamCopy {
template <typename AViewType, typename BViewType>
KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member,
const AViewType &A,
const BViewType &B);
};
template <typename MemberType, typename ArgTrans = Trans::NoTranspose, int rank = 2>
struct TeamVectorCopy {
template <typename AViewType, typename BViewType>
KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member,
const AViewType &A,
const BViewType &B);
};
template <typename MemberType, typename ArgTrans, typename ArgMode, int rank = 2>
struct Copy {
template <typename AViewType, typename BViewType>
KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member,
const AViewType &A,
const BViewType &B);
};
Performs batched matrix or vector copying from source to destination. For each pair of matrices or vectors in the batch, copies:
where:
\(\text{op}(A)\) can be \(A\) or \(A^T\) (transpose)
\(A\) is the source matrix or vector
\(B\) is the destination matrix or vector
This operation supports both rank-1 (vector) and rank-2 (matrix) views, controlled by the rank
template parameter.
Parameters¶
- member:
Team execution policy instance (not used in Serial mode)
- A:
Input view containing source matrices or vectors
- B:
Output view for destination matrices or vectors
Type Requirements¶
MemberType
must be a Kokkos TeamPolicy member typeArgTrans
must be one of:Trans::NoTranspose
- copy A to B (default)Trans::Transpose
- copy transpose of A to B
ArgMode
must be one of:Mode::Serial
- for serial executionMode::Team
- for team-based executionMode::TeamVector
- for team-vector execution
rank
must be either 1 (for vectors) or 2 (for matrices, default)AViewType
andBViewType
must be Kokkos Views with compatible dimensions:For rank=1: A(n) → B(n) for NoTranspose
For rank=2: A(m,n) → B(m,n) for NoTranspose, or A(m,n) → B(n,m) for Transpose
Example¶
#include <Kokkos_Core.hpp>
#include <KokkosBatched_Copy_Decl.hpp>
using execution_space = Kokkos::DefaultExecutionSpace;
using memory_space = execution_space::memory_space;
using device_type = Kokkos::Device<execution_space, memory_space>;
// Scalar type to use
using scalar_type = double;
int main(int argc, char* argv[]) {
Kokkos::initialize(argc, argv);
{
// Define matrix dimensions
int batch_size = 1000; // Number of matrices in batch
int m = 8; // Rows
int n = 6; // Columns
// Create views for batched matrices
Kokkos::View<scalar_type***, Kokkos::LayoutRight, device_type>
A("A", batch_size, m, n), // Source matrices
B("B", batch_size, m, n), // Destination for direct copy
C("C", batch_size, n, m); // Destination for transposed copy
// Initialize source matrices
Kokkos::RangePolicy<execution_space> policy(0, batch_size);
Kokkos::parallel_for("init_matrices", policy, KOKKOS_LAMBDA(const int i) {
// Initialize the i-th source matrix with index-based values
for (int row = 0; row < m; ++row) {
for (int col = 0; col < n; ++col) {
A(i, row, col) = 10.0 * row + col + 1.0;
}
}
// Initialize destination matrices to zero
for (int row = 0; row < m; ++row) {
for (int col = 0; col < n; ++col) {
B(i, row, col) = 0.0;
}
}
for (int row = 0; row < n; ++row) {
for (int col = 0; col < m; ++col) {
C(i, row, col) = 0.0;
}
}
});
Kokkos::fence();
// Perform batched direct copy using TeamPolicy
using team_policy_type = Kokkos::TeamPolicy<execution_space>;
team_policy_type policy_team(batch_size, Kokkos::AUTO);
Kokkos::parallel_for("batched_direct_copy", policy_team,
KOKKOS_LAMBDA(const typename team_policy_type::member_type& member) {
// Get batch index from team rank
const int i = member.league_rank();
// Extract batch slices
auto A_i = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL());
auto B_i = Kokkos::subview(B, i, Kokkos::ALL(), Kokkos::ALL());
// Perform direct copy (A → B)
KokkosBatched::TeamCopy<
typename team_policy_type::member_type, // MemberType
KokkosBatched::Trans::NoTranspose, // ArgTrans
2 // rank
>::invoke(member, A_i, B_i);
}
);
Kokkos::fence();
// Perform batched transposed copy using TeamVectorPolicy
team_policy_type policy_team_vector(batch_size, Kokkos::AUTO, Kokkos::AUTO);
Kokkos::parallel_for("batched_transpose_copy", policy_team_vector,
KOKKOS_LAMBDA(const typename team_policy_type::member_type& member) {
// Get batch index from team rank
const int i = member.league_rank();
// Extract batch slices
auto A_i = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL());
auto C_i = Kokkos::subview(C, i, Kokkos::ALL(), Kokkos::ALL());
// Perform transposed copy (A^T → C)
KokkosBatched::TeamVectorCopy<
typename team_policy_type::member_type, // MemberType
KokkosBatched::Trans::Transpose, // ArgTrans
2 // rank
>::invoke(member, A_i, C_i);
}
);
Kokkos::fence();
// Copy results to host for verification
auto A_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
Kokkos::subview(A, 0, Kokkos::ALL(), Kokkos::ALL()));
auto B_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
Kokkos::subview(B, 0, Kokkos::ALL(), Kokkos::ALL()));
auto C_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
Kokkos::subview(C, 0, Kokkos::ALL(), Kokkos::ALL()));
// Verify the direct copy (A → B)
printf("Verifying direct copy (first few elements):\n");
for (int row = 0; row < std::min(3, m); ++row) {
for (int col = 0; col < std::min(3, n); ++col) {
printf(" A(%d,%d) = %.1f, B(%d,%d) = %.1f\n",
row, col, A_host(row, col), row, col, B_host(row, col));
// Check for errors
if (std::abs(A_host(row, col) - B_host(row, col)) > 1e-10) {
printf(" ERROR: Direct copy mismatch at (%d,%d)\n", row, col);
}
}
}
// Verify the transposed copy (A^T → C)
printf("\nVerifying transposed copy (first few elements):\n");
for (int row = 0; row < std::min(3, n); ++row) {
for (int col = 0; col < std::min(3, m); ++col) {
printf(" A(%d,%d) = %.1f, C(%d,%d) = %.1f\n",
col, row, A_host(col, row), row, col, C_host(row, col));
// Check for errors
if (std::abs(A_host(col, row) - C_host(row, col)) > 1e-10) {
printf(" ERROR: Transposed copy mismatch at A(%d,%d) vs C(%d,%d)\n",
col, row, row, col);
}
}
}
// Demonstrate vector copying
int vec_length = 10;
// Create views for batched vectors
Kokkos::View<scalar_type**, Kokkos::LayoutRight, device_type>
X("X", batch_size, vec_length), // Source vectors
Y("Y", batch_size, vec_length); // Destination vectors
// Initialize source vectors
Kokkos::parallel_for("init_vectors", policy, KOKKOS_LAMBDA(const int i) {
for (int j = 0; j < vec_length; ++j) {
X(i, j) = j + 1.0;
Y(i, j) = 0.0;
}
});
Kokkos::fence();
// Perform batched vector copy using SerialCopy inside a parallel_for
Kokkos::parallel_for("batched_vector_copy", policy, KOKKOS_LAMBDA(const int i) {
// Extract batch slices
auto X_i = Kokkos::subview(X, i, Kokkos::ALL());
auto Y_i = Kokkos::subview(Y, i, Kokkos::ALL());
// Perform vector copy (X → Y)
KokkosBatched::SerialCopy<
KokkosBatched::Trans::NoTranspose, // ArgTrans
1 // rank = 1 for vectors
>::invoke(X_i, Y_i);
});
Kokkos::fence();
// Copy vector results to host for verification
auto X_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
Kokkos::subview(X, 0, Kokkos::ALL()));
auto Y_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
Kokkos::subview(Y, 0, Kokkos::ALL()));
// Verify the vector copy
printf("\nVerifying vector copy (first few elements):\n");
for (int j = 0; j < std::min(5, vec_length); ++j) {
printf(" X(%d) = %.1f, Y(%d) = %.1f\n", j, X_host(j), j, Y_host(j));
// Check for errors
if (std::abs(X_host(j) - Y_host(j)) > 1e-10) {
printf(" ERROR: Vector copy mismatch at element %d\n", j);
}
}
}
Kokkos::finalize();
return 0;
}