Skip to content

Tensor Implementation

Yutao Xu edited this page Aug 14, 2025 · 8 revisions

First, let's dive deeper into the implementation of the Tensor structure. In PyTorch this structure is defined here: https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/templates/TensorBody.h#L92.

TensorBase

class Tensor : public TensorBase {};

Using TensorBase as the base class for Tensor can improve compilation efficiency, as any extension to the Tensor does not require recompiling the TensorBase.

class TensorBase {
	intrusive_ptr<TensorImpl> impl_;
  intrusive_ptr<GradFunction> grad_fn_;
};

TensorImpl

TensorImpl is an intrusive pointer target, which means that Tensor is just a reference. TensorImpl contains a reference counter, which will automatically release the memory when the count reaches zero. We can roughly define the details as below:

class TensorImpl : public intrusive_ptr_target {
	  int dim_;
    dim_t shape_;
    dim_t stride_;
    ScalarType dtype_;
    int64_t numel_;
    intrusive_ptr<TensorStorage> storage_;
    int64_t storage_offset_ = 0;
    bool is_contiguous_ = true;
    bool requires_grad_ = false;
    std::unique_ptr<TensorBase, TensorDeleter> grad_;
    /* ... */
};

Storage

So, what is TensorStorage ? First, it's still an intrusive pointer target. Then, it contains a DataPtr class, which allocated on the GPU using cuMalloc and freed with cuFree when its lifecycle ends:

class TensorStorage : public intrusive_ptr_target {
    size_t size_;
    int device_;
    DataPtr ptr_;
public:
    TensorStorage(size_t size, int device) :
        size_(size), device_(device) {
        ptr_ = DeviceAllocator::GetInstance()->allocate(size, device);
    }
};

class DataPtr {
    void *data_;                              // Lifetime tied to ctx_
    std::unique_ptr<void, DeleterFnPtr> ctx_; // For automatic release
public:
    DataPtr(void *data, void *ctx, DeleterFnPtr ctx_deleter) :
        data_(data), ctx_(ctx, ctx_deleter ? ctx_deleter : &delete_nothing) {
    }
};

CachingAllocator

The CachingAllocator is used to allocate GPU memory. Since it is caching-based, freeing a buffer usually does not actually release it, instead, the buffer is placed into a free list. If the size of a future allocation request is less than or equal to (and not much smaller than) that buffer, the allocator will reuse it. Such memory blocks typically meet certain alignment requirements.

class DeviceAllocator {
    std::set<Block *, CompareBlock> unused_blocks_[POOL_SIZE];
    std::unordered_set<Block *> active_blocks_;
    std::unordered_map<void *, Block *> ptr_to_block_;
    DataPtr allocate(size_t size_in_bytes, int device, int stream = 0);
};

DataPtr DeviceAllocator::allocate(const size_t size, int device, int stream) {
    set_device(device);
    size_t aligned_size = (size + ALIGNMENT - 1) / ALIGNMENT * ALIGNMENT;
    auto &pool = unused_blocks_[_find_pool_index(size)];
    auto search_key = Block(size, stream);
    auto it = pool.lower_bound(&search_key);
    if (it == pool.end()) {
        auto raw_ptr = dmalloc(aligned_size);
        auto new_block_ptr = new Block();
        new_block_ptr->ptr = raw_ptr;
        new_block_ptr->size = aligned_size;
        new_block_ptr->device = device;
        new_block_ptr->stream = stream;
        new_block_ptr->in_use = true;
        ptr_to_block_[raw_ptr] = new_block_ptr;
        bool inserted = active_blocks_.insert(new_block_ptr).second;
        CHECK_FAIL(inserted);
        return {new_block_ptr->ptr, new_block_ptr->ptr, delete_impl};
    } else {
        (*it)->in_use = true;
        pool.erase(it);
        bool inserted = active_blocks_.insert((*it)).second;
        CHECK_FAIL(inserted);
        return {(*it)->ptr, (*it)->ptr, delete_impl};
    }
    return {nullptr, nullptr, delete_nothing};
}

void DeviceAllocator::free(void *ptr) {
    Block *block_ptr = ptr_to_block_[ptr];
    block_ptr->in_use = false;
    active_blocks_.erase(block_ptr);
    auto size = block_ptr->size;
    bool inserted = unused_blocks_[_find_pool_index(size)].insert(block_ptr).second;
    CHECK_FAIL(inserted);
}

Create a Tensor View

We can change the way a tensor is interpreted without modifying its storage by only adjusting its stride, size, or offset. This approach is typically used when calling t.permute(), t.view(), or t.transpose().

Tensor TensorImpl::permute(const std::vector<int64_t> dims) const {
    const auto ndim = dim_;
    CHECK_FAIL(ndim == dims.size());
    auto new_sizes = std::vector<int64_t>(ndim);
    auto new_strides = std::vector<int64_t>(ndim);
    std::vector<bool> seen_dims(ndim);
    for (int i = 0; i < ndim; i++) {
        int d = maybe_wrap_dim(dims[i], ndim);
        CHECK_FAIL(!seen_dims[d], "permute(): duplicate dims are not allowed.");
        seen_dims[d] = true;
        new_sizes[i] = this->shape_[d];
        new_strides[i] = this->stride_[d];
    }
    return as_strided(new_sizes, new_strides);
}

void TensorImpl::as_strided_(std::vector<int64_t> sizes, std::vector<int64_t> strides, int64_t storage_offset) {
    auto ndim = sizes.size();
    // in-bounds check
    auto [min_offset, max_offset] = compute_offset_range(sizes, strides, ndim);
    min_offset += storage_offset;
    max_offset += storage_offset;
    CHECK_FAIL(min_offset >= 0);
    CHECK_FAIL(max_offset * element_size(dtype_) < this->storage_bytes());
    // create tensor view
    dim_ = ndim;
    int64_t numel = 1;
    for (int i = 0; i < ndim; i++) {
        shape_[i] = sizes[i];
        numel *= sizes[i];
        stride_[i] = strides[i];
    }
    numel_ = numel;
    is_contiguous_ = false;
    storage_offset_ = storage_offset;
}

GradFunction

And what is GradFunction ? We provide its definition, and it's easy to see that:

  1. It requires the definition of backward function.
  2. It references the input TensorImpl to prevent them from being freed during the backward computation.
class GradFunction : public intrusive_ptr_target {
public:
    virtual std::vector<Tensor> backward(Tensor grad_output) = 0;
    std::vector<Tensor> inputs;
};

class AddGradFunction : public GradFunction {
public:
    std::vector<Tensor> backward(Tensor grad_output) override {
        std::vector<Tensor> grad_inputs;
        grad_inputs.resize(2);
        if (inputs[0].requires_grad()) {
            grad_inputs[0] = grad_output;
        }
        if (inputs[1].requires_grad()) {
            grad_inputs[1] = grad_output;
        }
        return grad_inputs;
    }
    AddGradFunction(const Tensor &left, const Tensor &right) {
        inputs.push_back(left);
        inputs.push_back(right);
    }
};

Note: PyTorch uses Node as the node of its autograd, and maintains an Edge table to represent the graph. For simplicity, we can use the following algorithm to perform backpropagation:

void Tensor::backward(Tensor grad_output) {
    std::queue<Tensor *> ready;
    std::unordered_map<TensorImpl *, int> needed;
    std::unordered_map<TensorImpl *, Tensor> grad_acc;
    // build needed counts
    ready.push(this);
    while (!ready.empty()) {
        auto t = ready.front();
        ready.pop();
        if (t->has_grad_fn()) {
            for (auto &input : t->grad_fn_.get()->inputs) {
                if (!input.requires_grad()) continue;
                needed[input.impl()] += 1;
                ready.push(&input);
            }
        }
    }
    // calculate gradients
    grad_acc[this->impl()] = grad_output;
    ready.push(this);
    while (!ready.empty()) {
        auto t = ready.front();
        ready.pop();
        auto go = grad_acc[t->impl()];
        if (t->has_grad_fn()) {
            auto grad_fn = t->grad_fn_.get();
            auto gis = grad_fn->backward(go);
            auto &inputs = grad_fn->inputs;
            for (int i = 0; i < inputs.size(); ++i) {
                if (!inputs[i].requires_grad()) continue;
                auto &acc = grad_acc[inputs[i].impl()];
                acc = acc.defined() ? (acc + gis[i]) : gis[i];
                if (--needed[inputs[i].impl()] == 0) {
                    ready.push(&inputs[i]);
                }
            }
        } else if (t->requires_grad()) {
            t->update_grad(go);
        }
    }
}

void Tensor::update_grad(Tensor grad) {
    auto impl = this->impl_.get();
    if (impl->grad_.get()) {
        *impl->grad_ += grad;
    } else {
        Tensor grad_ = empty_like(grad);
        grad_.copy_(grad);
        impl->grad_ = std::unique_ptr<Tensor, TensorDeleter>(new Tensor(grad_), [](Tensor *t) { delete t; });
    }
}

Integrate Binary Add Autograd

After finishing the autograd engine, adding hooks for binary add function becomes very intuitive.

Tensor add(const Tensor &left, const Tensor &right) {
    Tensor out;
    out = add_out(out, left, right);
    out.set_requires_grad((left.requires_grad() || right.requires_grad()));
    if (out.requires_grad()) {
        out.set_grad_fn(new AddGradFunction(left, right));
    }
    return out;
}

Clone this wiki locally