// Copyright (c) 2020 ETH Zurich, University of Bologna
// All rights reserved.
//
// This code is under development and not yet released to the public.
// Until it is released, the code is under the copyright of ETH Zurich and
// the University of Bologna, and may contain confidential and/or unpublished
// work. Any reuse/redistribution is strictly forbidden without written
// permission from ETH Zurich.
//
// Thomas Benz <tbenz@ethz.ch>

/// Splits a generic 1D transfer into AXI-conform transfers.
///
/// The module holds exactly one 1D burst request at a time (burst_q). In every cycle it presents
/// one read request and one write request that each describe the next chunk of the held transfer.
/// A chunk never extends past the next boundary, which is
/// - the end of the current PageSize-byte page in normal operation, or
/// - the end of the current bus word (StrbWidth bytes) if deburst is set.
/// In coupled mode (decouple_rw = 0) the closer of the source and destination boundary limits
/// both channels and both channels advance in lock-step; in decoupled mode each channel is limited
/// by its own boundary only and advances on its own ready signal.
/// A new request is accepted (ready_o) in the cycle in which both channels present their final
/// chunk and both downstream ready signals are high.
module axi_dma_burst_reshaper #(
    /// Data width of the AXI bus
    parameter int unsigned DataWidth = -1,
    /// Address width of the AXI bus
    parameter int unsigned AddrWidth = -1,
    /// ID width of the AXI bus
    parameter int unsigned IdWidth = -1,
    /// Arbitrary 1D burst request definition. Fields read by this module:
    /// - id: the AXI id used - this id should be constant, as the DMA does not support reordering
    /// - src, dst: source and destination address, same width as the AXI 4 interface
    /// - num_bytes: the length of the contiguous 1D transfer requested, can be up to 32/64 bit long
    ///              num_bytes will be interpreted as an unsigned number; it is stored internally
    ///              in an AddrWidth-bit field
    ///              A value of 0 clears the internal valid flags, so no read or write request is
    ///              issued for that transfer
    /// - cache_src, cache_dst: the configuration of the cache fields in the AX beats
    /// - burst_src, burst_dst: currently only incremental bursts are supported (2'b01)
    /// - decouple_rw: if set to true, there is no longer exactly one AXI write_request issued for
    ///   every read request. This mode can improve performance of unaligned transfers when crossing
    ///   the AXI page boundaries.
    /// - deburst: if set, the DMA will split all bursts in single transfers
    parameter type         burst_req_t = logic,
    /// Read request definition. Fields driven by this module:
    /// - ar descriptor
    ///  - id: AXI id
    ///  - last: this is the last transaction of the 1D transfer
    ///  - addr: bus-word-aligned start address of the burst
    ///  - len: burst length (number of beats minus one)
    ///  - size: beat size, always set to clog2(StrbWidth)
    ///  - burst: burst type; only INC supported
    ///  - cache: cache type
    /// - r descriptor
    ///  - offset: initial misalignment
    ///  - tailer: final misalignment
    ///  - shift: amount the data needs to be shifted to realign it
    parameter type         read_req_t = logic,
    /// Write request definition. Fields driven by this module:
    /// - aw descriptor
    ///  - id: AXI id
    ///  - last: this is the last transaction of the 1D transfer
    ///  - addr: bus-word-aligned start address of the burst
    ///  - len: burst length (number of beats minus one)
    ///  - size: beat size, always set to clog2(StrbWidth)
    ///  - burst: burst type; only INC supported
    ///  - cache: cache type
    /// - w descriptor
    ///  - offset: initial misalignment
    ///  - tailer: final misalignment
    ///  - num_beats: copy of aw.len
    ///  - is_single: burst length is 0
    parameter type         write_req_t = logic

) (
    /// Clock
    input  logic          clk_i,
    /// Asynchronous reset, active low
    input  logic          rst_ni,
    /// Arbitrary 1D burst request
    input  burst_req_t    burst_req_i,

    /// Handshake: burst request is valid
    input  logic          valid_i,
    /// Handshake: burst request can be accepted.
    /// Driven combinationally and only asserted while valid_i is high.
    output logic          ready_o,

    /// Write transfer request
    output write_req_t    write_req_o,
    /// Read transfer request
    output read_req_t     read_req_o,

    /// Handshake: read transfer request valid
    output logic          r_valid_o,
    /// Handshake: read transfer request ready
    input  logic          r_ready_i,
    /// Handshake: write transfer request valid
    output logic          w_valid_o,
    /// Handshake: write transfer request ready
    input  logic          w_ready_i
);

    /// Number of bytes in one bus word
    localparam int unsigned StrbWidth     = DataWidth / 8;
    /// Number of address bits selecting a byte within one bus word
    localparam int unsigned OffsetWidth   = $clog2(StrbWidth);
    /// Largest chunk emitted as one burst: the smaller of 4096 bytes and 256 bus words
    /// (presumably the AXI4 4 KiB boundary rule and the 256-beat INCR limit - confirm)
    localparam int unsigned PageSize      = (256 * StrbWidth > 4096) ? 4096 :  256 * StrbWidth;
    /// Number of address bits selecting a byte within one page
    localparam int unsigned PageAddrWidth = $clog2(PageSize);
    /// Offset type
    typedef logic [  OffsetWidth-1:0] offset_t;
    /// Address Type
    typedef logic [    AddrWidth-1:0] addr_t;
    /// AXI ID Type
    typedef logic [      IdWidth-1:0] axi_id_t;

    /// Type containing burst description for each channel independently
    typedef struct packed {
        // AXI id used for this channel's requests
        axi_id_t          id;
        // byte address of the next chunk to be issued
        addr_t            addr;
        // number of bytes of the 1D transfer that still have to be issued
        addr_t            num_bytes;
        // cache field forwarded to the AR/AW request
        axi_pkg::cache_t  cache;
        // burst type forwarded to the AR/AW request
        axi_pkg::burst_t  burst;
        // a transfer is pending on this channel; gates r_valid_o / w_valid_o
        logic             valid;
    } burst_chan_t;

    /// Type containing burst description
    typedef struct packed {
        // state of the read (source) channel
        burst_chan_t  src;
        // state of the write (destination) channel
        burst_chan_t  dst;
        // source offset minus destination offset within a bus word, fixed per 1D transfer
        offset_t      shift;
        // read and write channels advance independently if set
        logic         decouple_rw;
        // limit every chunk to a single bus word if set
        logic         deburst;
    } burst_decoupled_t;

    //--------------------------------------
    // state; internally hold one transfer
    //--------------------------------------
    // burst_q is the registered state, burst_d the combinationally computed next state
    burst_decoupled_t burst_d, burst_q;

    //--------------------------------------
    // page boundary check
    //--------------------------------------
    // byte offset of the current source address within its page (or bus word when debursting)
    logic [PageAddrWidth-1:0] r_page_offset;
    // bytes from the current source address up to the next boundary; one extra bit so that the
    // value PageSize itself is representable
    logic [PageAddrWidth  :0] r_num_bytes_to_pb;
    // same two quantities for the destination address
    logic [PageAddrWidth-1:0] w_page_offset;
    logic [PageAddrWidth  :0] w_num_bytes_to_pb;
    // minimum of the read and write distance, used in coupled mode
    logic [PageAddrWidth  :0] c_num_bytes_to_pb;

    // Computes, for the current source and destination addresses, how many bytes can be
    // transferred before the next boundary is reached.
    always_comb begin : proc_write_page_boundry_check
        // implement deburst operation
        if (burst_q.deburst) begin
            // deburst: the boundary is the end of the current bus word
            // read side: byte offset within the bus word (upper bits are zero-extended)
            r_page_offset     = burst_q.src.addr[OffsetWidth-1:0];
            // how many bytes remain until the end of the current bus word?
            // NOTE(review): for an offset below StrbWidth the difference already lies in
            // 1..StrbWidth, so the modulo does not appear to alter the value - confirm.
            r_num_bytes_to_pb = (StrbWidth - r_page_offset) % (2 * StrbWidth);

            // write side: byte offset within the bus word
            w_page_offset     = burst_q.dst.addr[OffsetWidth-1:0];
            // how many bytes remain until the end of the current bus word?
            w_num_bytes_to_pb = (StrbWidth - w_page_offset) % (2 * StrbWidth);
        end else begin
            // bursts allowed: the boundary is the end of the current page
            // read side: byte offset within the page
            r_page_offset     = burst_q.src.addr[PageAddrWidth-1:0];
            // how many bytes remain in the current page?
            r_num_bytes_to_pb = PageSize - r_page_offset;

            // write side: byte offset within the page
            w_page_offset     = burst_q.dst.addr[PageAddrWidth-1:0];
            // how many bytes remain in the current page?
            w_num_bytes_to_pb = PageSize - w_page_offset;
        end
        // how many bytes remain when considering both the read and the write boundary?
        // take the boundary that is closer
        c_num_bytes_to_pb = (r_num_bytes_to_pb > w_num_bytes_to_pb) ?
                             w_num_bytes_to_pb : r_num_bytes_to_pb;

    end

    //--------------------------------------
    // Synchronized R/W process
    //--------------------------------------
    // read channel: upper limit for this chunk, actual chunk size, last-chunk flag and the
    // byte offset of the chunk's start address within a bus word
    logic [PageAddrWidth:0] r_num_bytes_possible;
    logic [PageAddrWidth:0] r_num_bytes;
    logic                   r_finish;
    logic [OffsetWidth-1:0] r_addr_offset;

    // write channel: same quantities as above
    logic [PageAddrWidth:0] w_num_bytes_possible;
    logic [PageAddrWidth:0] w_num_bytes;
    logic                   w_finish;
    logic [OffsetWidth-1:0] w_addr_offset;

    // Derives the current read/write requests from burst_q and computes the next state burst_d.
    // The statement order matters: burst_d first defaults to burst_q, is then updated per
    // channel, and is finally overwritten by the refill section when a new request is accepted.
    always_comb begin : proc_read_write_transaction

        // default: keep last state
        burst_d = burst_q;

        //--------------------------------------
        // Specify read transaction
        //--------------------------------------
        // max num bytes according to page(s): own boundary when decoupled, closer of both
        // boundaries when coupled
        r_num_bytes_possible = (burst_q.decouple_rw == 1'b1) ?
                                r_num_bytes_to_pb : c_num_bytes_to_pb;

        // more bytes remaining than we can send
        if (burst_q.src.num_bytes > r_num_bytes_possible) begin
            // send as much as the boundary allows
            r_num_bytes           = r_num_bytes_possible;
            // calculate remainder
            burst_d.src.num_bytes = burst_q.src.num_bytes - r_num_bytes_possible;
            // not finished
            r_finish              = 1'b0;
            // next address, depends on burst type. only type 01 is supported yet;
            // any other burst type leaves the address unchanged
            burst_d.src.addr      = (burst_q.src.burst == axi_pkg::BURST_INCR) ?
                                    burst_q.src.addr + r_num_bytes : burst_q.src.addr;

        // remaining bytes fit in one burst
        // reset storage for the read channel to stop this channel
        end else begin
            // the remainder is at most r_num_bytes_possible here, so the low
            // PageAddrWidth+1 bits hold the full value
            r_num_bytes   = burst_q.src.num_bytes[PageAddrWidth:0];
            // default: when a transfer is finished, set it to 0 (this also clears src.valid)
            burst_d.src   = '0;
            // finished
            r_finish      = 1'b1;
        end

        // byte offset of the chunk's start address within a bus word
        r_addr_offset     = burst_q.src.addr[OffsetWidth-1:0];

        // create the AR request
        // start address with the in-word offset bits cleared
        read_req_o.ar.addr     = { burst_q.src.addr[AddrWidth-1:OffsetWidth],
                                   {{OffsetWidth}{1'b0}} };
        // bus-word index of the last byte of the chunk, i.e. number of beats minus one
        read_req_o.ar.len      = ((r_num_bytes + r_addr_offset - 1) >> OffsetWidth);
        // every beat uses the full bus width
        read_req_o.ar.size     = axi_pkg::size_t'(OffsetWidth);
        read_req_o.ar.id       = burst_q.src.id;
        // marks the last AR request belonging to the current 1D transfer
        read_req_o.ar.last     = r_finish;
        read_req_o.ar.burst    = burst_q.src.burst;
        read_req_o.ar.cache    = burst_q.src.cache;
        // in coupled mode a read request is only offered while the write side is ready
        r_valid_o              = burst_q.decouple_rw ?
                                 burst_q.src.valid : burst_q.src.valid & w_ready_i;

        // create the R request
        // position of the first byte within the first beat
        read_req_o.r.offset = r_addr_offset;
        // end position (offset + length) truncated to OffsetWidth bits
        read_req_o.r.tailer = OffsetWidth'(r_num_bytes + r_addr_offset);
        // shift is determined on a per 1D request base
        read_req_o.r.shift  = burst_q.shift;

        //--------------------------------------
        // Specify write transaction
        //--------------------------------------
        // max num bytes according to page(s): own boundary when decoupled, closer of both
        // boundaries when coupled
        w_num_bytes_possible = (burst_q.decouple_rw == 1'b1) ?
                                w_num_bytes_to_pb : c_num_bytes_to_pb;

        // more bytes remaining than we can send
        if (burst_q.dst.num_bytes > w_num_bytes_possible) begin
            // send as much as the boundary allows
            w_num_bytes           = w_num_bytes_possible;
            // calculate remainder
            burst_d.dst.num_bytes = burst_q.dst.num_bytes - w_num_bytes_possible;
            // not finished
            w_finish              = 1'b0;
            // next address, depends on burst type. only type 01 is supported yet;
            // any other burst type leaves the address unchanged
            burst_d.dst.addr      = (burst_q.dst.burst == axi_pkg::BURST_INCR) ?
                                     burst_q.dst.addr + w_num_bytes : burst_q.dst.addr;

        // remaining bytes fit in one burst
        // reset storage for the write channel to stop this channel
        end else begin
            // the remainder is at most w_num_bytes_possible here, so the low
            // PageAddrWidth+1 bits hold the full value
            w_num_bytes   = burst_q.dst.num_bytes[PageAddrWidth:0];
            // default: when a transfer is finished, set it to 0 (this also clears dst.valid)
            burst_d.dst   = '0;
            // finished
            w_finish      = 1'b1;
        end

        // byte offset of the chunk's start address within a bus word
        w_addr_offset     = burst_q.dst.addr[OffsetWidth-1:0];

        // create the AW request
        // start address with the in-word offset bits cleared
        write_req_o.aw.addr     = { burst_q.dst.addr[AddrWidth-1:OffsetWidth],
                                    {{OffsetWidth}{1'b0}} };
        // bus-word index of the last byte of the chunk, i.e. number of beats minus one
        write_req_o.aw.len      = ((w_num_bytes + w_addr_offset - 1) >> OffsetWidth);
        // every beat uses the full bus width
        write_req_o.aw.size     = axi_pkg::size_t'(OffsetWidth);
        write_req_o.aw.id       = burst_q.dst.id;
        // marks the last AW request belonging to the current 1D transfer
        write_req_o.aw.last     = w_finish;
        write_req_o.aw.burst    = burst_q.dst.burst;
        write_req_o.aw.cache    = burst_q.dst.cache;
        // in coupled mode a write request is only offered while the read side is ready
        w_valid_o               = burst_q.decouple_rw ?
                                  burst_q.dst.valid : burst_q.dst.valid & r_ready_i;

        // create the W request
        // position of the first byte within the first beat
        write_req_o.w.offset    = w_addr_offset;
        // end position (offset + length) truncated to OffsetWidth bits
        write_req_o.w.tailer    = OffsetWidth'(w_num_bytes + w_addr_offset);
        // the W descriptor carries the same beat count as the AW request
        write_req_o.w.num_beats = write_req_o.aw.len;
        // is the transfer only one beat in length? Counters don't have to be initialized then.
        write_req_o.w.is_single = (write_req_o.aw.len == '0);

        //--------------------------------------
        // Module control
        //--------------------------------------
        // a new request can be accepted when both channels are on their final chunk (or idle,
        // since an all-zero channel state also yields finish = 1), a request is offered and both
        // downstream interfaces are ready
        ready_o = r_finish & w_finish & valid_i & r_ready_i & w_ready_i;

        //--------------------------------------
        // Refill
        //--------------------------------------
        // new request is taken in if both r and w machines are ready.
        // This overrides the per-channel next state computed above.
        if (ready_o) begin
            // burst_req_t has a different field layout than burst_chan_t, so the request is
            // copied field by field
            burst_d.src.id          = burst_req_i.id;
            burst_d.src.addr        = burst_req_i.src;
            burst_d.src.num_bytes   = burst_req_i.num_bytes;
            burst_d.src.cache       = burst_req_i.cache_src;
            burst_d.src.burst       = burst_req_i.burst_src;
            // check if transfer is possible -> num_bytes has to be larger than 0
            burst_d.src.valid       = (burst_req_i.num_bytes == '0) ? 1'b0 : valid_i;

            burst_d.dst.id          = burst_req_i.id;
            burst_d.dst.addr        = burst_req_i.dst;
            burst_d.dst.num_bytes   = burst_req_i.num_bytes;
            burst_d.dst.cache       = burst_req_i.cache_dst;
            burst_d.dst.burst       = burst_req_i.burst_dst;
            // check if transfer is possible -> num_bytes has to be larger than 0
            burst_d.dst.valid       = (burst_req_i.num_bytes == '0) ? 1'b0 : valid_i;

            burst_d.decouple_rw     = burst_req_i.decouple_rw;
            burst_d.deburst         = burst_req_i.deburst;
            // shift is calculated for each 1D transfer: difference of the in-word offsets of
            // source and destination, wrapping within OffsetWidth bits
            burst_d.shift           = burst_req_i.src[OffsetWidth-1:0] -
                                      burst_req_i.dst[OffsetWidth-1:0];

            // assertions: only incremental bursts may be requested
            // NOTE(review): these concurrent assertions are placed inside the procedural
            // if (ready_o) branch - confirm that this is the intended enabling condition.
            // pragma translate_off
            `ifndef VERILATOR
            assert property (@(posedge clk_i) disable iff (~rst_ni)
                    (valid_i |-> burst_req_i.burst_src inside {axi_pkg::BURST_INCR})) else
                $fatal(1, "Unsupported DMA src_burst request..");
            assert property (@(posedge clk_i) disable iff (~rst_ni)
                    (valid_i |-> burst_req_i.burst_dst inside {axi_pkg::BURST_INCR})) else
                $fatal(1, "Unsupported DMA dst_burst request.");
            `endif
            // pragma translate_on
        end
    end

    //--------------------------------------
    // State
    //--------------------------------------
    // Registers the next state. The mode bits and the shift are updated every cycle (burst_d
    // equals burst_q unless a new request is accepted); the channel states only advance when
    // the corresponding downstream interface is ready.
    always_ff @(posedge clk_i or negedge rst_ni) begin
        if (!rst_ni) begin
            // asynchronous reset: no transfer pending, all mode bits cleared
            burst_q.decouple_rw        <= '0;
            burst_q.deburst            <= '0;
            burst_q.shift              <= '0;
            burst_q.src                <= '0;
            burst_q.dst                <= '0;
        end else begin
            burst_q.decouple_rw        <= burst_d.decouple_rw;
            burst_q.deburst            <= burst_d.deburst;
            burst_q.shift              <= burst_d.shift;
            // couple read and write machines in the coupled mode
            // The selection uses the next-state mode bit; when a new request is loaded, ready_o
            // implies that both ready signals are high, so both channels update in either branch.
            if (burst_d.decouple_rw) begin
                // decoupled: each channel advances on its own ready
                if (r_ready_i)              burst_q.src <= burst_d.src;
                if (w_ready_i)              burst_q.dst <= burst_d.dst;
            end else begin
                // coupled: both channels advance together, only when both sides are ready
                if (r_ready_i & w_ready_i)  burst_q.src <= burst_d.src;
                if (w_ready_i & r_ready_i)  burst_q.dst <= burst_d.dst;
            end
        end
    end
endmodule : axi_dma_burst_reshaper