Tutorial 8 - GPU Data Node

The GPU data node creates various attributes for use in a CUDA-based GPU compute. Several representative types are used, though the list of potential attribute types is not exhaustive. See Attribute Data Types for the full list.

This node also introduces the notion of attribute typedefs, a useful concept when passing data between functions.
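
For example (a minimal sketch, not part of the tutorial files; scalePointsGPU is a hypothetical helper), the generated typedefs let a function be declared without spelling out the raw pointer types, and the same declaration resolves to the correct CPU or CUDA types on each side:

#include <OgnTutorialCudaDataDatabase.h>

// Hypothetical helper declaration. "inputs::points_t" and "outputs::points_t"
// come from the generated database header; on the CPU side they resolve to
// GfVec3f pointers and on the CUDA side to float3 pointers.
extern "C" void scalePointsGPU(outputs::points_t outputPoints,
                               inputs::points_t inputPoints,
                               size_t pointCount);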

OgnTutorialCudaData.ogn

The ogn file shows the implementation of a node named “omni.graph.tutorials.CudaData”, which has inputs and outputs of several different types used in its computations. Three different CUDA methods are created to show how each of these types is passed through to the GPU and used by CUDA.

{
    "CudaData" : {
        "version": 1,
        "categories": "tutorials",
        "memoryType": "cuda",
        "description": [
            "This is a tutorial node. It performs different functions on the GPU to illustrate different types of",
            "data access. The first adds inputs 'a' and 'b' to yield output 'sum', all of which are on the GPU.",
            "The second is a sample expansion deformation that multiplies every point on a set of input points,",
            "stored on the GPU, by a constant value, stored on the CPU, to yield a set of output points, also on the GPU.",
            "The third is an assortment of different data types illustrating how different data is passed to the GPU.",
            "This particular node uses CUDA for its GPU computations, as indicated in the memory type value.",
            "Normal use case for GPU compute is large amounts of data. For testing purposes this node only handles",
            "a very small amount but the principle is the same."
        ],
        "metadata":
        {
            "uiName": "Tutorial Node: Attributes With CUDA Data"
        },
        "inputs": {
            "a": {
                "type": "float",
                "description": ["First value to be added in algorithm 1"],
                "default": 0.0
            },
            "b": {
                "type": "float",
                "description": ["Second value to be added in algorithm 1"],
                "default": 0.0
            },
            "points": {
                "type": "float[3][]",
                "description": ["Points to be moved by algorithm 2"],
                "default": []
            },
            "multiplier": {
                "type": "float[3]",
                "memoryType": "cpu",
                "description": ["Amplitude of the expansion for the input points in algorithm 2"],
                "default": [1.0, 1.0, 1.0]
            },
            "half": {
                "type": "half",
                "description": ["Input of type half for algorithm 3"],
                "default": 1.0
            },
            "color": {
                "type": "colord[3]",
                "description": ["Input with three doubles as a color for algorithm 3"],
                "default": [1.0, 0.5, 1.0]
            },
            "matrix": {
                "type": "matrixd[4]",
                "description": ["Input with 16 doubles interpreted as a double-precision 4d matrix"],
                "default": [[1.0,0.0,0.0,0.0], [0.0,1.0,0.0,0.0], [0.0,0.0,1.0,0.0], [0.0,0.0,0.0,1.0]]
            }
        },
        "outputs": {
            "sum": {
                "type": "float",
                "description": ["Sum of the two inputs from algorithm 1"]
            },
            "points": {
                "type": "float[3][]",
                "description": ["Final positions of points from algorithm 2"]
            },
            "half": {
                "type": "half",
                "description": ["Output of type half for algorithm 3"]
            },
            "color": {
                "type": "colord[3]",
                "description": ["Output with three doubles as a color for algorithm 3"]
            },
            "matrix": {
                "type": "matrixd[4]",
                "description": ["Output with 16 doubles interpreted as a double-precision 4d matrix"]
            }
        },
        "$why": "By putting inputs but no outputs in tests I can run the compute without failing on extraction",
        "tests": [
            {
                "inputs:a": 1.0, "inputs:b": 2.0,
                "inputs:half": 1.0,
                "inputs:color": [0.5, 0.6, 0.7],
                "inputs:matrix": [[1.0,2.0,3.0,4.0],[2.0,3.0,4.0,5.0],[3.0,4.0,5.0,6.0],[4.0,5.0,6.0,7.0]],
                "inputs:points": [[1.0, 2.0, 3.0], [2.0, 3.0, 4.0]],
                "inputs:multiplier": [2.0, 3.0, 4.0]
            }
        ],
        "$tests_disabled_until_gpu_data_extraction_works": [
            { "inputs:a": 1.0, "inputs:b": 2.0, "outputs:sum": 3.0 },
            { "inputs:a": 5.0, "inputs:b": 3.0, "outputs:sum": 8.0 },
            {
                "inputs:half": 1.0, "outputs:half": 2.0,
                "inputs:color": [0.5, 0.6, 0.7], "outputs:color": [0.5, 0.4, 0.3],
                "inputs:matrix": [[1.0,2.0,3.0,4.0],[2.0,3.0,4.0,5.0],[3.0,4.0,5.0,6.0],[4.0,5.0,6.0,7.0]],
                "outputs:matrix": [[30.0,40.0,50.0,60.0],[40.0,54.0,68.0,82.0],[50.0,68.0,86.0,104.0],[60.0,82.0,104.0,126.0]],
                "inputs:points": [[1.0, 2.0, 3.0], [2.0, 3.0, 4.0]],
                "inputs:multiplier": [2.0, 3.0, 4.0],
                "outputs:points": [[2.0, 6.0, 12.0], [4.0, 9.0, 16.0]]
            }
        ]
    }
}

OgnTutorialCudaData.cpp

The cpp file contains the implementation of the compute method, which in turn calls the three CUDA algorithms.

// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto.  Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
//
#include <omni/graph/core/GpuArray.h>

#include <OgnTutorialCudaDataDatabase.h>

// This function exercises referencing simple data types on the GPU, something you normally wouldn't do as
// there is no efficiency gain in doing that versus just passing in the CPU value.
extern "C" void applyAddGPU(outputs::sum_t sum, inputs::a_t a, inputs::b_t b);

// This function exercises referencing of array data on the GPU.
// The CUDA code takes its own "float3" data types, which are castable equivalents to the generated GfVec3f.
// The GpuArray/ConstGpuArray wrappers isolate the GPU pointers from the CPU code.
extern "C" void applyDeformationGPU(outputs::points_t outputPoints,
                                    inputs::points_t inputPoints,
                                    inputs::multiplier_t multiplier,
                                    size_t numberOfPoints);

// This function exercises referencing non-standard data types on the GPU to illustrate how data of different
// types are passed to the GPU.
extern "C" void applyDataTypes(outputs::half_t halfOutput,
                               outputs::color_t colorOutput,
                               outputs::matrix_t matrixOutput,
                               inputs::half_t halfInput,
                               inputs::color_t colorInput,
                               inputs::matrix_t matrixInput);

// This node runs a couple of algorithms on the GPU, while accessing parameters from the CPU
class OgnTutorialCudaData
{
public:
    static bool compute(OgnTutorialCudaDataDatabase& db)
    {
        // ================ algorithm 1 =========================================================
        // It's an important distinction here that GPU data is always returned as raw pointers since the CPU code
        // in the node cannot directly access it. The raw pointers are passed into the GPU code for dereferencing.
        applyAddGPU(db.outputs.sum(), db.inputs.a(), db.inputs.b());

        // ================ algorithm 2 =========================================================
        size_t numberOfPoints = db.inputs.points.size();
        db.outputs.points.resize(numberOfPoints);
        if (numberOfPoints > 0)
        {
            applyDeformationGPU(db.outputs.points(), db.inputs.points(), db.inputs.multiplier(), numberOfPoints);
        }

        // ================ algorithm 3 =========================================================
        applyDataTypes(db.outputs.half(), db.outputs.color(), db.outputs.matrix(),
                       db.inputs.half(), db.inputs.color(), db.inputs.matrix());
        return true;
    }
};

REGISTER_OGN_NODE()

OgnTutorialCudaData_CUDA.cu

The cu file contains the implementation of the algorithms on the GPU using CUDA.

// Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto.  Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
//
#include <OgnTutorialCudaDataDatabase.h>

// ======================================================================
// CUDA compute implementation code will usually have two methods;
//     xxxGPU() - Host function to act as an intermediary between CPU and GPU code
//     xxxCUDA() - CUDA implementation of the actual node algorithm.

// ======================================================================
// Algorithm 1 support - add two values together.
// Note how the generated typedefs are used to make declarations easier.
// On the CUDA side the types are different than what you'd see on the CPU side.
// e.g. a CPU type of GfVec3f appears in CUDA as float3.
__global__ void applyAddCUDA(outputs::sum_t output, inputs::a_t a, inputs::b_t b)
{
    *output = *a + *b;
}
extern "C" void applyAddGPU(outputs::sum_t output, inputs::a_t a, inputs::b_t b)
{
    // Launch the GPU code - only a single thread and block is needed since this is a single set of data
    applyAddCUDA<<<1, 1>>>(output, a, b);
}

// ======================================================================
// Algorithm 2 support - apply the multiplier to "inputPoints" to yield "outputPoints"
__global__ void applyDeformationCUDA(
    outputs::points_t outputPoints,
    inputs::points_t inputPoints,
    inputs::multiplier_t multiplier,
    size_t numberOfPoints
)
{
    // Make sure the current evaluation block is in range of the available points
    int currentPointIndex = blockIdx.x * blockDim.x + threadIdx.x;
    if (numberOfPoints <= currentPointIndex) return;

    // Apply the multiplier to the current points
    (*outputPoints)[currentPointIndex].x = (*inputPoints)[currentPointIndex].x * multiplier.x;
    (*outputPoints)[currentPointIndex].y = (*inputPoints)[currentPointIndex].y * multiplier.y;
    (*outputPoints)[currentPointIndex].z = (*inputPoints)[currentPointIndex].z * multiplier.z;
}
// CPU interface called from OgnTutorialCudaData::compute, used to launch the GPU workers.
// "numberOfPoints" is technically redundant since it's equal to inputPoints.size(), however that call
// is only available on the __device__ (GPU) side and the value is needed in the calculation of the
// number of blocks required here on the __host__ (CPU) side so it's better to pass it in.
extern "C" void applyDeformationGPU(
    outputs::points_t outputPoints,
    inputs::points_t inputPoints,
    inputs::multiplier_t& multiplier, // Using a ref here gives a matching type for CPU-side data
    size_t numberOfPoints
)
{
    // Split the work into 256 threads, an arbitrary number that could be more precisely tuned when necessary
    const int numberOfThreads = 256;
    // Block size is the number of points that fit into each of the threads
    const int numberOfBlocks = (numberOfPoints + numberOfThreads - 1) / numberOfThreads;

    // Launch the GPU deformation using the calculated number of threads and blocks
    applyDeformationCUDA<<<numberOfBlocks, numberOfThreads>>>(outputPoints, inputPoints, multiplier, numberOfPoints);
}

// ======================================================================
// Algorithm 3 support - simple manipulation of an input to yield the similarly named output.
// Typedefs are used to make declaration simple - the actual CUDA types are in the comment beside the attribute name.
__global__ void applyDataTypesCUDA(
    outputs::half_t halfOutput,     // __half
    outputs::color_t colorOutput,   // double3
    outputs::matrix_t matrixOutput, // Matrix4d
    inputs::half_t  halfInput,      // const __half
    inputs::color_t colorInput,     // const double3
    inputs::matrix_t matrixInput    // const Matrix4d
)
{
    // half and color values are doubled
    *halfOutput = __hadd(*halfInput, *halfInput);
    *colorOutput = *colorInput + *colorInput;
    // matrix value is squared
    *matrixOutput = *matrixInput;
    *matrixOutput *= *matrixInput;
}
extern "C" void applyDataTypes(
    outputs::half_t halfOutput,
    outputs::color_t colorOutput,
    outputs::matrix_t matrixOutput,
    inputs::half_t  halfInput,
    inputs::color_t colorInput,
    inputs::matrix_t matrixInput
)
{
    // Launch the GPU code - only a single thread and block is needed since this is a single set of data
    applyDataTypesCUDA<<<1, 1>>>(halfOutput, colorOutput, matrixOutput, halfInput, colorInput, matrixInput);
}

GPU Attribute Access

Here is the set of generated attribute accessors from the database. Attributes living on the GPU return pointers to memory, since the CPU side cannot dereference GPU memory into its actual type (e.g. a float value, which would be returned as a float& on the CPU side, is instead returned as a float* on the GPU side).

In addition, when calling into CUDA code the data changes type as it crosses the GPU boundary. The CUDA side uses native CUDA data types where it can, and these are bytewise compatible with their CPU counterparts. Note that the CPU attribute multiplier is passed to the CUDA code by value, since its data has to be copied from CPU to GPU.

Database Function    | Is GPU? | CPU Type           | CUDA Type
-------------------- | ------- | ------------------ | ---------------
inputs.a()           | Yes     | const float*       | const float*
inputs.b()           | Yes     | const float*       | const float*
outputs.sum()        | Yes     | float*             | float*
inputs.half()        | Yes     | const pxr::GfHalf* | const __half*
outputs.half()       | Yes     | pxr::GfHalf*       | __half*
inputs.color()       | Yes     | const GfVec3d*     | const double3*
outputs.color()      | Yes     | GfVec3d*           | double3*
inputs.matrix()      | Yes     | const GfMatrix4d*  | const Matrix4d*
outputs.matrix()     | Yes     | GfMatrix4d*        | Matrix4d*
inputs.multiplier()  | No      | const GfVec3f&     | const float3
inputs.points()      | Yes     | const GfVec3f*     | const float3**
outputs.points()     | Yes     | GfVec3f*           | float3**
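
To make the distinction concrete, here is a small sketch (illustrative only, mirroring the compute method above) of how the CPU-side code sees these accessors:

// GPU attribute: the accessor returns a pointer into GPU memory, which the
// CPU code passes along but never dereferences itself.
inputs::a_t aOnGpu = db.inputs.a();                  // const float*

// CPU attribute: the accessor returns an ordinary reference that can be
// read directly on the CPU.
const GfVec3f& multiplier = db.inputs.multiplier();
float amplitudeX = multiplier[0];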

The array attribute points does not have an array-like wrapper, as the CUDA code deals directly with raw pointers. To provide the size information, the value of inputs.points.size() should also be passed in when calling the CUDA code.

Notice the subtle difference in types on the CPU side for GPU-based data: instead of references to data there are pointers, necessary since the data lives in a different memory space, and the pointers have an extra level of indirection for the same reason.

There is also a section of the generated database file dedicated to information relevant to the CUDA code. In this section the CUDA attribute data types are defined. It is protected by #ifdef __CUDACC__ so that it is only processed when the file is included by the CUDA compiler (and vice versa, so that none of the other setup code is processed on the CUDA side).

#include <cuda_fp16.h>
#include <omni/graph/core/cuda/CUDAUtils.h>
#include <omni/graph/core/cuda/Matrix4d.h>
namespace OgnTutorialCudaDataCudaTypes
{
namespace inputs
{
using a_t = const float*;
using b_t = const float*;
using points_t = const float3**;
using multiplier_t = const float3;
using half_t = const __half*;
using color_t = const double3*;
using matrix_t = const Matrix4d*;
}
namespace outputs
{
using sum_t = float*;
using points_t = float3**;
using half_t = __half*;
using color_t = double3*;
using matrix_t = Matrix4d*;
}
}
using namespace OgnTutorialCudaDataCudaTypes;

Notice the inclusion of the file cuda_fp16.h, needed due to the use of the CUDA type __half, and the files omni/graph/core/cuda/CUDAUtils.h and omni/graph/core/cuda/Matrix4d.h, which provide support functions for CUDA math.
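
As a standalone illustration (a minimal sketch, not part of the tutorial files), half-precision arithmetic on the device relies on the intrinsics declared in cuda_fp16.h, in the same way algorithm 3 above uses __hadd:

#include <cuda_fp16.h>

// Minimal device kernel that doubles a half-precision value, mirroring the
// halfOutput computation in algorithm 3 above.
__global__ void doubleHalfCUDA(__half* result, const __half* value)
{
    // __hadd is the half-precision addition intrinsic from cuda_fp16.h
    *result = __hadd(*value, *value);
}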

The data types used by CUDA are compatible with their equivalents on the C++ side, so arguments can be passed from C++ into CUDA by using a declaration such as this on the C++ side:

// In this code "inputs::points_t" is the type "GfVec3f*".
// The size of that array must be passed in as well since it is not implicit in the data type.
extern "C" void cudaCompute(
    inputs::points_t*,
    size_t pointSize,
    inputs::multiplier_t*,
    outputs::points_t*
);

which corresponds to this function defined in the .cu file:

// In this code "inputs::points_t" is the type "float3*"
extern "C" void cudaCompute(
    inputs::points_t* inPoints,
    size_t pointSize,
    inputs::multiplier_t* multiplier,
    outputs::points_t* outPoints
)
{...}

Pointers are used in the calls, rather than being built into the type definitions, to emphasize that the values passed through are pointers to the real data in the flatcache, which in this case is data living in GPU memory.
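
As a final sketch (illustrative only; the kernel name and its contents are not part of the tutorial), this is how such GPU pointers are ultimately consumed on the device side, following the same pattern as algorithm 2 above: the array attribute arrives as a pointer to the GPU array pointer, so it is dereferenced once to reach the point data and then indexed per thread.

#include <OgnTutorialCudaDataDatabase.h>

// Hypothetical kernel showing the extra level of indirection on array attributes:
// "inputs::points_t" is const float3**, so *inputPoints is the GPU array itself.
__global__ void copyPointsCUDA(outputs::points_t outputPoints,
                               inputs::points_t inputPoints,
                               size_t numberOfPoints)
{
    size_t currentPointIndex = blockIdx.x * blockDim.x + threadIdx.x;
    if (currentPointIndex >= numberOfPoints)
        return;
    (*outputPoints)[currentPointIndex] = (*inputPoints)[currentPointIndex];
}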