Tutorial 27 - GPU Data Node with CPU Array Pointers

The GPU data node illustrates an alternative method of extracting array data from the GPU: returning a CPU pointer to the GPU array. Normally array data is returned as a GPU pointer to an array of GPU pointers, a layout optimized for future parallel processing of GPU array data. By returning a CPU pointer to the array instead, you can dereference the pointers with host-side code.
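
To make the distinction concrete, here is a minimal standalone sketch using only plain CUDA runtime calls; it is illustrative only (none of these names come from OmniGraph) and simply contrasts a device address held in host memory with a pointer array that lives entirely on the device:

    #include <cuda_runtime.h>
    #include <cstdio>

    int main()
    {
        // One GPU array of three floats
        float* deviceArray = nullptr;
        cudaMalloc(&deviceArray, 3 * sizeof(float));

        // "CPU pointer to the GPU array": the pointer value itself lives in host memory,
        // so host code can read it directly and pass it to kernels or runtime calls.
        float* cpuHeldDevicePointer = deviceArray;
        printf("Device address readable on the host: %p\n", static_cast<void*>(cpuHeldDevicePointer));

        // "GPU pointer to an array of GPU pointers": the pointer array itself is in device
        // memory, so host code cannot dereference it; the value must be copied back first
        // or used inside device code.
        float** devicePointerArray = nullptr;
        cudaMalloc(&devicePointerArray, sizeof(float*));
        cudaMemcpy(devicePointerArray, &deviceArray, sizeof(float*), cudaMemcpyHostToDevice);

        float* fetchedOnHost = nullptr;
        cudaMemcpy(&fetchedOnHost, devicePointerArray, sizeof(float*), cudaMemcpyDeviceToHost);
        printf("Device address recovered via an extra copy: %p\n", static_cast<void*>(fetchedOnHost));

        cudaFree(devicePointerArray);
        cudaFree(deviceArray);
        return 0;
    }

The node in this tutorial gets the first, host-readable form of the pointer because its .ogn file sets "cudaPointers": "cpu".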

OgnTutorialCudaDataCpu.ogn

The ogn file shows the implementation of a node named “omni.graph.tutorials.CudaCpuArrays”, which has an input and an output of type float[3][], along with the special cudaPointers keyword indicating that the pointers to the CUDA arrays should be returned in CPU memory space.

{
    "CudaCpuArrays" : {
        "version": 1,
        "memoryType": "cuda",
        "cudaPointers": "cpu",
        "description": [
            "This is a tutorial node. It illustrates the alternative method of extracting pointers to GPU array data",
            "in which the pointer returned is a CPU pointer and can be dereferenced on the CPU side. Without the",
            "cudaPointers value set that pointer would be a GPU pointer to an array of GPU pointers and could",
            "only be dereferenced on the device."
        ],
        "metadata":
        {
            "uiName": "Tutorial Node: Attributes With CUDA Array Pointers In Cpu Memory"
        },
        "categories": "tutorials",
        "inputs": {
            "points": {
                "type": "float[3][]",
                "memoryType": "any",
                "description": ["Array of points to be moved"],
                "default": []
            },
            "multiplier": {
                "type": "float[3]",
                "description": ["Amplitude of the expansion for the input points"],
                "default": [1.0, 1.0, 1.0]
            }
        },
        "outputs": {
            "points": {
                "type": "float[3][]",
                "description": ["Final positions of points"]
            }
        },
        "tests": [
            {
                "inputs:multiplier": [1.0, 2.0, 3.0],
                "inputs:points": [[1.0, 1.0, 1.0], [2.0, 2.0, 2.0], [3.0, 3.0, 3.0]],
                "outputs:points": [[1.0, 2.0, 3.0], [2.0, 4.0, 6.0], [3.0, 6.0, 9.0]]
            }
        ]
    }
}
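
Note that although the node-level memoryType is cuda, the points input overrides it with any, which is why the C++ compute method below can ask for either a GPU view (db.inputs.points.gpu()) or a CPU view (db.inputs.points.cpu()) of that attribute; the other attributes simply inherit the node-level CUDA memory space.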

OgnTutorialCudaDataCpu.cpp

The cpp file contains the implementation of the compute method, which in turn calls the CUDA algorithm.

// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto.  Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
//
#include <omni/graph/core/GpuArray.h>

#include <OgnTutorialCudaDataCpuDatabase.h>

// This function exercises referencing of array data on the GPU.
// The CUDA code takes its own "float3" data types, which are castable equivalents to the generated GfVec3f.
// The GpuArray/ConstGpuArray wrappers isolate the GPU pointers from the CPU code.
extern "C" void applyDeformationCpuToGpu(pxr::GfVec3f* outputPoints,
                                         const pxr::GfVec3f* inputPoints,
                                         const pxr::GfVec3f* multiplier,
                                         size_t numberOfPoints);

// This node runs a deformation algorithm on the GPU, while dereferencing the array pointers from the CPU
class OgnTutorialCudaDataCpu
{
public:
    static bool compute(OgnTutorialCudaDataCpuDatabase& db)
    {
        size_t numberOfPoints = db.inputs.points.size();
        db.outputs.points.resize(numberOfPoints);
        if (numberOfPoints > 0)
        {
            // The main point to note here is how the pointer can be dereferenced on the CPU side, whereas normally
            // you would have to send it to the GPU for dereferencing. (The long-term purpose of the latter is to make
            // it more efficient to handle arrays-of-arrays on the GPU; however, since that is not yet implemented
            // we can get away with a single dereference here.)
            applyDeformationCpuToGpu(*db.outputs.points(), *db.inputs.points.gpu(), db.inputs.multiplier(), numberOfPoints);

            // Just as a test now also reference the points as CPU data to ensure the value casts correctly
            const float* pointsAsCpu = reinterpret_cast<const float*>(db.inputs.points.cpu().data());
        }

        return true;
    }
};

REGISTER_OGN_NODE()
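
Because the dereference happens on the host, the device address obtained inside compute could also be handed to standard CUDA runtime calls. A minimal, hypothetical fragment (not part of the tutorial node; it assumes cuda_runtime.h is included and omits error checking):

    // Copy the first input point back to the host purely as a sanity check
    pxr::GfVec3f firstPoint;
    cudaMemcpy(&firstPoint, *db.inputs.points.gpu(), sizeof(pxr::GfVec3f), cudaMemcpyDeviceToHost);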

OgnTutorialCudaDataCpu_CUDA.cu

The cu file contains the implementation of the algorithm on the GPU using CUDA.

// Copyright (c) 2020-2021, NVIDIA CORPORATION.  All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto.  Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
//
#include <OgnTutorialCudaDataCpuDatabase.h>

// ======================================================================
// Apply the multiplier to "inputPoints" to yield "outputPoints"
__global__ void applyDeformationCUDA(
    float3* outputPoints,
    const float3* inputPoints,
    const float3* multiplier,
    size_t numberOfPoints
)
{
    // Make sure the current evaluation block is in range of the available points
    int currentPointIndex = blockIdx.x * blockDim.x + threadIdx.x;
    if (numberOfPoints <= currentPointIndex) return;

    // Apply the multiplier to the current points
    outputPoints[currentPointIndex].x = inputPoints[currentPointIndex].x * multiplier->x;
    outputPoints[currentPointIndex].y = inputPoints[currentPointIndex].y * multiplier->y;
    outputPoints[currentPointIndex].z = inputPoints[currentPointIndex].z * multiplier->z;
}
// CPU interface called from OgnTutorialCudaDataCpu::compute, used to launch the GPU workers.
// "numberOfPoints" is technically redundant since it's equal to inputPoints.size(), however that call
// is only available on the __device__ (GPU) side and the value is needed in the calculation of the
// number of blocks required here on the __host__ (CPU) side so it's better to pass it in.
extern "C" void applyDeformationCpuToGpu(
    float3* outputPoints,
    const float3* inputPoints,
    const float3* multiplier,
    size_t numberOfPoints
)
{
    // Use 256 threads per block, an arbitrary number that could be more precisely tuned when necessary
    const int numberOfThreads = 256;
    // The number of blocks is however many chunks of that size are needed to cover all of the points
    const int numberOfBlocks = (numberOfPoints + numberOfThreads - 1) / numberOfThreads;

    // Launch the GPU deformation using the calculated number of threads and blocks
    applyDeformationCUDA<<<numberOfBlocks, numberOfThreads>>>(outputPoints, inputPoints, multiplier, numberOfPoints);
}
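
With the three points used in the test block of the .ogn file, for example, this works out to (3 + 255) / 256 = 1, so a single block of 256 threads is launched and the range check at the top of the kernel discards the 253 threads that have no point to process.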

OgnTutorialCudaDataCpuPy.py

The py file contains the implementation of the compute method, which for this example does not actually compute anything, since extra extension support (e.g. a Python -> CUDA compiler) would be required for Python code to run on the GPU.

"""
Implementation of the Python node accessing CUDA attributes in a way that accesses the GPU arrays with a CPU pointer.

No actual computation is done here as the tutorial nodes are not set up to handle GPU computation.
"""
import ctypes

import omni.graph.core as og

# Only one type of data is handled by the compute - pointf[3][]
POINT_ARRAY_TYPE = og.Type(og.BaseDataType.FLOAT, tuple_count=3, array_depth=1, role=og.AttributeRole.POSITION)


def get_address(attr: og.Attribute) -> int:
    """Returns the contents of the memory the attribute points to"""
    ptr_type = ctypes.POINTER(ctypes.c_size_t)
    ptr = ctypes.cast(attr.memory, ptr_type)
    return ptr.contents.value


class OgnTutorialCudaDataCpuPy:
    """Exercise GPU access for extended attributes through a Python OmniGraph node"""

    @staticmethod
    def compute(db) -> bool:
        """Accesses the CUDA data, which for arrays exists as wrappers around CPU memory pointers to GPU pointer arrays.
        No compute is done here.
        """

        # Put accessors into local variables for convenience
        input_points = db.inputs.points
        multiplier = db.inputs.multiplier

        # Set the size to what is required for the multiplication - this can be done without accessing GPU data.
        # Notice that since these are CPU pointers to GPU data the size has to be taken from the data type
        # description rather than the usual method of taking len(input_points).
        db.outputs.points_size = input_points.dtype.size

        # After changing the size the memory isn't allocated immediately; allocation is delayed until you
        # request access to the data, which is what this line does.
        output_points = db.outputs.points

        # This is a separate test to add a points attribute to the output bundle to show how when a bundle has
        # CPU pointers to the GPU data that information propagates to its children
        # Start with an empty output bundle.
        output_bundle = db.outputs.outBundle
        output_bundle.clear()
        output_bundle.add_attributes([og.Type(og.BaseDataType.FLOAT, 3, 1)], ["points"])
        bundle_attr = output_bundle.attribute_by_name("points")
        # As for the main attributes, setting the bundle member size readies the buffer of the given size on the GPU
        bundle_attr.size = input_points.dtype.size

        # The output cannot be written to here through the normal assignment mechanisms, e.g. the typical step of
        # copying input points to the output points, as the data is not accessible on the GPU through Python directly.
        # Instead you can access the GPU memory pointers through the attribute values and send it to CUDA code, either
        # generated from the Python code or accessed through something like pybind wrappers.

        print("Locations in CUDA() should be in GPU memory space")
        print(f"    CPU Location for reference = {hex(id(db))}", flush=True)
        print(f"    Input points are {input_points} at CUDA({hex(get_address(input_points))})", flush=True)
        print(f"    Multiplier is CUDA({multiplier})", flush=True)
        print(f"    Output points are {output_points} at CUDA({hex(get_address(output_points))})", flush=True)
        print(f"    Bundle {bundle_attr.gpu_value} at CUDA({hex(get_address(bundle_attr.gpu_value))})", flush=True)

        return True