// SPDX-FileCopyrightText: 2026 Alex Bates <alex@bates64.com>
//
// SPDX-License-Identifier: AGPL-3.0-or-later

/// C bridge implementation for parallel-rdp's Granite Vulkan context and RDP
/// command processor.

#include "bridge.hpp"
#include "context.hpp"
#include "device.hpp"
#include "logging.hpp"
#include "rdp_device.hpp"

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <memory>
#include <vector>

using namespace Vulkan;

// -- Logging --

/// Log sink installed through rdp_set_log_callback(); nullptr while no sink
/// is registered, in which case RdpLoggingInterface::log() declines messages.
static void (*s_log_callback)(uint32_t level, const char *msg) = nullptr;

/// Routes Granite log messages to a Rust callback.
///
/// The severity is derived from the tag prefix ("[ERROR]" / "[WARN]",
/// anything else is treated as info), the message is formatted into a fixed
/// 1 KiB stack buffer (longer messages are truncated by vsnprintf), and a
/// single trailing newline is removed before the callback is invoked.
class RdpLoggingInterface final : public Util::LoggingInterface {
public:
    /// Formats one log message and forwards it to `s_log_callback`.
    ///
    /// @param tag  Severity tag; a null tag is treated as info.
    /// @param fmt  printf-style format string.
    /// @param va   Arguments matching `fmt`.
    /// @return true if the message was delivered to the callback; false if no
    ///         callback is installed or the message could not be formatted.
    bool log(const char *tag, const char *fmt, va_list va) override
    {
        if (!s_log_callback)
            return false;

        uint32_t level = RDP_LOG_LEVEL_INFO;
        if (tag) {
            if (strncmp(tag, "[ERROR]", 7) == 0)
                level = RDP_LOG_LEVEL_ERROR;
            else if (strncmp(tag, "[WARN]", 6) == 0)
                level = RDP_LOG_LEVEL_WARN;
        }

        char buf[1024];
        // vsnprintf returns a negative value on an encoding/format error and
        // then leaves the buffer contents unspecified, so the buffer must not
        // be read in that case. Report the message as unhandled instead, the
        // same result the caller gets when no callback is installed.
        const int written = vsnprintf(buf, sizeof(buf), fmt, va);
        if (written < 0)
            return false;

        // Strip trailing newline (tracing adds its own).
        size_t len = strlen(buf);
        if (len > 0 && buf[len - 1] == '\n')
            buf[len - 1] = '\0';

        s_log_callback(level, buf);
        return true;
    }
};

/// Single shared instance whose address rdp_set_log_callback() hands to
/// Util::set_thread_logging_interface().
static RdpLoggingInterface s_logging_interface;

/// Installs `callback` as the log sink, or removes the sink when `callback`
/// is null. The Granite logging interface is registered or cleared to match.
/// NOTE(review): going by the Granite function name, the registration is
/// presumably scoped to the calling thread — confirm against Granite.
void rdp_set_log_callback(void (*callback)(uint32_t level, const char *msg))
{
    s_log_callback = callback;

    if (callback) {
        Util::set_thread_logging_interface(&s_logging_interface);
    } else {
        Util::set_thread_logging_interface(nullptr);
    }
}

// -- Internal types --

/// Bundles the Granite Vulkan context with the device created on top of it.
/// Returned to C callers as an opaque pointer by rdp_context_create().
struct RdpContext {
    std::unique_ptr<Context> context;
    // Declared after `context`, so it is destroyed first (members are
    // destroyed in reverse declaration order).
    std::unique_ptr<Device> device;
};

/// Per-renderer state, returned to C callers as an opaque pointer by
/// rdp_renderer_create().
struct RdpRenderer {
    // Non-owning pointer to the context the renderer was created from;
    // rdp_renderer_destroy() does not delete it.
    RdpContext *ctx;
    std::unique_ptr<RDP::CommandProcessor> processor;
    // RDRAM size passed to rdp_renderer_create(); reported back by
    // rdp_renderer_get_rdram_size().
    uint32_t rdram_size;
};

// -- Vulkan context --

void *rdp_context_create(
    const char *const *instance_ext, uint32_t num_instance_ext,
    const char *const *device_ext, uint32_t num_device_ext)
{
    if (!Context::init_loader(nullptr))
        return nullptr;

    auto context = std::make_unique<Context>();
    if (!context->init_instance_and_device(
            instance_ext, num_instance_ext,
            device_ext, num_device_ext, 0))
        return nullptr;

    auto device = std::make_unique<Device>();
    device->set_context(*context);

    auto *ctx = new RdpContext();
    ctx->context = std::move(context);
    ctx->device = std::move(device);
    return ctx;
}

void rdp_context_destroy(void *ctx)
{
    delete static_cast<RdpContext *>(ctx);
}

/// Returns the handle reported by the Granite context's get_instance(),
/// type-erased to void *. `ctx` must be a valid RdpContext pointer.
void *rdp_context_get_instance(void *ctx)
{
    auto *bundle = static_cast<RdpContext *>(ctx);
    return bundle->context->get_instance();
}

/// Returns the handle reported by the Granite context's get_gpu(),
/// type-erased to void *. `ctx` must be a valid RdpContext pointer.
void *rdp_context_get_physical_device(void *ctx)
{
    auto *bundle = static_cast<RdpContext *>(ctx);
    return bundle->context->get_gpu();
}

/// Returns the handle reported by the Granite context's get_device(),
/// type-erased to void *. `ctx` must be a valid RdpContext pointer.
void *rdp_context_get_device(void *ctx)
{
    auto *bundle = static_cast<RdpContext *>(ctx);
    return bundle->context->get_device();
}

/// Returns the graphics queue handle, type-erased to void *.
///
/// @param ctx           A valid RdpContext pointer.
/// @param family_index  Optional out-parameter; when non-null it receives the
///                      queue family index of the returned queue.
void *rdp_context_get_queue(void *ctx, uint32_t *family_index)
{
    // Slot 0 is the graphics queue (QUEUE_INDEX_GRAPHICS = 0).
    constexpr unsigned graphics_slot = 0;

    auto *bundle = static_cast<RdpContext *>(ctx);
    auto &queue_info = bundle->context->get_queue_info();

    if (family_index != nullptr)
        *family_index = queue_info.family_indices[graphics_slot];

    return queue_info.queues[graphics_slot];
}

// -- Renderer --

void *rdp_renderer_create(void *ctx, uint32_t rdram_size, uint32_t flags)
{
    auto *context = static_cast<RdpContext *>(ctx);

    auto renderer = std::make_unique<RdpRenderer>();
    renderer->ctx = context;
    renderer->rdram_size = rdram_size;

    // Pass nullptr for rdram_ptr so the CommandProcessor allocates its own
    // host-coherent GPU buffer. This avoids the non-coherent path where
    // host-to-GPU uploads during scanout can overwrite GPU-rendered data.
    renderer->processor = std::make_unique<RDP::CommandProcessor>(
        *context->device,
        nullptr,
        0,                  // rdram_offset
        rdram_size,
        rdram_size / 2,     // hidden_rdram_size
        static_cast<RDP::CommandProcessorFlags>(flags));

    if (!renderer->processor->device_is_supported()) {
        return nullptr;
    }

    auto *ptr = renderer.release();
    return ptr;
}

/// Destroys a renderer created by rdp_renderer_create().
///
/// Passing nullptr is a no-op, matching rdp_context_destroy(), so callers can
/// release a handle unconditionally (e.g. after a failed create).
void rdp_renderer_destroy(void *renderer)
{
    auto *r = static_cast<RdpRenderer *>(renderer);
    if (!r)
        return;

    // Ensure all GPU work completes before destroying the CommandProcessor,
    // otherwise its destructor may race with in-flight commands.
    if (r->processor) {
        const uint64_t timeline = r->processor->signal_timeline();
        r->processor->wait_for_timeline(timeline);
    }
    delete r;
}

uint8_t *rdp_renderer_get_rdram(void *renderer)
{
    auto *r = static_cast<RdpRenderer *>(renderer);
    // The CommandProcessor's RDRAM is a host-coherent GPU buffer.
    // begin_read_rdram() maps it for host access (persistent on coherent buffers).
    return static_cast<uint8_t *>(r->processor->begin_read_rdram());
}

uint32_t rdp_renderer_get_rdram_size(void *renderer)
{
    return static_cast<RdpRenderer *>(renderer)->rdram_size;
}

/// Forwards to the command processor's begin_frame_context().
void rdp_renderer_begin_frame(void *renderer)
{
    auto *self = static_cast<RdpRenderer *>(renderer);
    self->processor->begin_frame_context();
}

/// Splits a stream of RDP command words into individual commands and enqueues
/// each one on the command processor.
///
/// @param renderer   A valid RdpRenderer pointer.
/// @param words      Command stream as 32-bit words; may be null only when
///                   `num_words` is 0 (a null stream is otherwise ignored).
/// @param num_words  Number of 32-bit words in `words`.
///
/// A trailing command that is cut short by the end of the stream is dropped
/// silently, along with anything after it.
void rdp_renderer_enqueue(void *renderer, const uint32_t *words, uint32_t num_words)
{
    // RDP command lengths in 64-bit words, indexed by command byte (bits [29:24]).
    // Most commands are 1 word (= 2 x 32-bit words). Triangle commands are larger.
    static constexpr unsigned cmd_len_lut[64] = {
        1, 1, 1, 1, 1, 1, 1, 1,        // 0x00-0x07: nop/invalid
        4, 6, 12, 14, 12, 14, 20, 22,   // 0x08-0x0F: triangles
        1, 1, 1, 1, 1, 1, 1, 1,        // 0x10-0x17: unused
        1, 1, 1, 1, 1, 1, 1, 1,        // 0x18-0x1F: unused
        1, 1, 1, 1, 2, 2, 1, 1,        // 0x20-0x27: tex rect (0x24,0x25) = 2
        1, 1, 1, 1, 1, 1, 1, 1,        // 0x28-0x2F: sync/scissor/modes
        1, 1, 1, 1, 1, 1, 1, 1,        // 0x30-0x37: load/tile/fill/color
        1, 1, 1, 1, 1, 1, 1, 1,        // 0x38-0x3F: color regs/combine/images
    };

    if (!words)
        return;

    auto *proc = static_cast<RdpRenderer *>(renderer)->processor.get();

    // Parse the word stream and pass each command to enqueue_command() on its
    // own, together with that command's length in 32-bit words.
    uint32_t i = 0;
    while (i < num_words) {
        const uint32_t cmd = (words[i] >> 24) & 63;
        const uint32_t len_32 = cmd_len_lut[cmd] * 2;

        // Compare against the words that remain rather than forming
        // `i + len_32`, which could wrap around in 32 bits for a stream close
        // to UINT32_MAX words and let a truncated command through.
        if (len_32 > num_words - i)
            break;

        proc->enqueue_command(len_32, &words[i]);
        i += len_32;
    }
}

/// Writes `value` to the VI register identified by `reg`, which is converted
/// to RDP::VIRegister without range checking.
void rdp_renderer_set_vi_register(void *renderer, uint32_t reg, uint32_t value)
{
    const auto vi_reg = static_cast<RDP::VIRegister>(reg);
    auto *self = static_cast<RdpRenderer *>(renderer);
    self->processor->set_vi_register(vi_reg, value);
}

void *rdp_renderer_scanout(void *renderer, uint32_t *width, uint32_t *height)
{
    auto *r = static_cast<RdpRenderer *>(renderer);

    RDP::ScanoutOptions options = {};
    options.persist_frame_on_invalid_input = true;
    options.blend_previous_frame = true;
    options.upscale_deinterlacing = false;

    Vulkan::ImageHandle image = r->processor->scanout(options);
    if (!image) {
        *width = 0;
        *height = 0;
        return nullptr;
    }

    *width = image->get_width();
    *height = image->get_height();

    // Return the raw VkImage handle.
    // The ImageHandle (ref-counted) keeps the image alive as long as the
    // CommandProcessor holds its internal reference (until next scanout).
    return image->get_image();
}

int rdp_renderer_scanout_sync(
    void *renderer,
    uint8_t *buffer, uint32_t buffer_size,
    uint32_t *width, uint32_t *height)
{
    auto *r = static_cast<RdpRenderer *>(renderer);

    std::vector<RDP::RGBA> colors;
    unsigned w = 0, h = 0;

    RDP::ScanoutOptions options = {};
    options.persist_frame_on_invalid_input = true;
    options.blend_previous_frame = true;
    options.upscale_deinterlacing = false;

    r->processor->scanout_sync(colors, w, h, options);

    if (w == 0 || h == 0 || colors.empty()) {
        *width = 0;
        *height = 0;
        return 0;
    }

    *width = w;
    *height = h;

    uint32_t needed = w * h * 4;
    if (buffer_size < needed)
        return 0;

    std::memcpy(buffer, colors.data(), needed);
    return 1;
}

void rdp_renderer_flush(void *renderer)
{
    auto *r = static_cast<RdpRenderer *>(renderer);
    uint64_t timeline = r->processor->signal_timeline();
    r->processor->wait_for_timeline(timeline);
}

uint64_t rdp_renderer_signal_timeline(void *renderer)
{
    auto *r = static_cast<RdpRenderer *>(renderer);
    return r->processor->signal_timeline();
}

void rdp_renderer_wait_for_timeline(void *renderer, uint64_t value)
{
    auto *r = static_cast<RdpRenderer *>(renderer);
    r->processor->wait_for_timeline(value);
}