// Copyright (c) 2020 ETH Zurich, University of Bologna
// All rights reserved.
//
// This code is under development and not yet released to the public.
// Until it is released, the code is under the copyright of ETH Zurich and
// the University of Bologna, and may contain confidential and/or unpublished
// work. Any reuse/redistribution is strictly forbidden without written
// permission from ETH Zurich.
//
// Thomas Benz <tbenz@ethz.ch>

/// Data path for the AXI DMA. This module handles the R and W channels of the
/// AXI protocol.
/// The module receives a read stream, realigns the data and emits a write stream.
/// AXI-like valid/ready handshaking is used to communicate with the rest
/// of the backend.
module axi_dma_data_path #(
    /// Data width of the AXI bus in bits. Has to be overridden: the default of
    /// -1 results in StrbWidth = 0 and is rejected by the parameter check below.
    parameter int DataWidth = -1,
    /// Number of elements the realignment buffer can hold. To achieve
    /// full performance a depth of 3 is minimally required.
    parameter int BufferDepth = -1,
    // DO NOT OVERWRITE THIS PARAMETER
    /// Number of byte lanes of the bus (one strobe bit and one FIFO per lane)
    parameter int StrbWidth = DataWidth / 8,
    /// Width of the byte offset / tailer / shift inputs
    parameter int OffsetWidth = $clog2(StrbWidth)
) (
    // status signals
    /// Clock
    input  logic                   clk_i,
    /// Asynchronous reset, active low
    input  logic                   rst_ni,

    // handshaking signals
    /// Handshake: read side of data path is presented with a valid request
    input  logic                   r_dp_valid_i,
    /// Handshake: read request is completed. High in the cycle in which the last
    /// read beat (r_last_i) is accepted while r_dp_valid_i is high
    output logic                   r_dp_ready_o,
    /// Handshake: write side of data path is presented with a valid request
    input  logic                   w_dp_valid_i,
    /// Handshake: write request is completed. High in the cycle in which the last
    /// write beat is transferred on the bus
    output logic                   w_dp_ready_o,

    // status signal
    /// High if the data path is idle: no request is pending or completing on
    /// either side and all byte FIFOs are empty
    output logic                   data_path_idle_o,

    // r-channel
    /// Read data from the AXI bus
    input  logic [DataWidth-1:0]   r_data_i,
    /// Valid signal of the AXI r channel
    input  logic                   r_valid_i,
    /// Last signal of the AXI r channel
    input  logic                   r_last_i,
    /// Response signal of the AXI r channel (not evaluated inside this module)
    input  logic [            1:0] r_resp_i,
    /// Ready signal of the AXI r channel
    output logic                   r_ready_o,

    /// number of bytes the end of the read transfer is short to reach a
    /// Bus-aligned boundary
    input  logic [OffsetWidth-1:0] r_tailer_i,
    /// number of bytes the read transfers starts after a
    /// Bus-aligned boundary
    input  logic [OffsetWidth-1:0] r_offset_i,
    /// The amount (in bytes) the read data has to be rotated to write-align it
    input  logic [OffsetWidth-1:0] r_shift_i,

    // w-channel
    /// Write data of the AXI bus
    output logic [DataWidth-1:0]   w_data_o,
    /// Write strobe of the AXI bus
    output logic [StrbWidth-1:0]   w_strb_o,
    /// Valid signal of the AXI w channel
    output logic                   w_valid_o,
    /// Last signal of the AXI w channel
    output logic                   w_last_o,
    /// Ready signal of the AXI w channel
    input  logic                   w_ready_i,

    /// number of bytes the write transfers starts after a
    /// Bus-aligned boundary
    input  logic [OffsetWidth-1:0] w_offset_i,
    /// number of bytes the end of the write transfer is short to reach a
    /// Bus-aligned boundary
    input  logic [OffsetWidth-1:0] w_tailer_i,
    /// Beat count of a multi-beat write transfer. The beat counter is loaded with
    /// this value on the first beat and the beat sent while the counter equals 1
    /// is marked as last, i.e. a burst consists of w_num_beats_i + 1 beats.
    /// NOTE(review): this looks like the AXI len encoding - confirm against caller.
    /// A value of 0 is only handled correctly together with w_is_single_i.
    input  logic [            7:0] w_num_beats_i,
    /// True if the transfer only consists of a single beat
    input  logic                   w_is_single_i
);

    //--------------------------------------
    // Parameter sanity checks (simulation only)
    //--------------------------------------
    // The default parameter values are placeholders; catch an instance that
    // forgot to override them instead of elaborating zero/negative-width logic.
    // pragma translate_off
    `ifndef VERILATOR
    initial begin : proc_param_check
        assert (DataWidth >= 8 && (DataWidth % 8) == 0)
            else $fatal(1, "DataWidth has to be a positive multiple of 8.");
        assert (StrbWidth == DataWidth / 8)
            else $fatal(1, "StrbWidth must not be overwritten.");
        assert (OffsetWidth >= 1 && OffsetWidth >= $clog2(StrbWidth))
            else $fatal(1, "OffsetWidth has to be able to address every byte lane.");
        assert (BufferDepth >= 1)
            else $fatal(1, "BufferDepth has to be at least 1.");
    end
    `endif
    // pragma translate_on

    // the buffer consists of one 8 bit wide FIFO per byte lane (StrbWidth FIFOs)
    // buffer is at least 3 deep to prevent stalls

    // 64 bit DATA Width example:
    // DDDDDDDD DDDDDDDD DDDDDDDD DDDDDDDD DDDDDDDD DDDDDDDD DDDDDDDD DDDDDDDD <- head
    // DDDDDDDD DDDDDDDD DDDDDDDD DDDDDDDD DDDDDDDD DDDDDDDD DDDDDDDD DDDDDDDD
    // DDDDDDDD DDDDDDDD DDDDDDDD DDDDDDDD DDDDDDDD DDDDDDDD DDDDDDDD DDDDDDDD <- tail
    // -byte7--|-byte6--|-byte5--|-byte4--|-byte3--|-byte2--|-byte1--|-byte0--|


    //--------------------------------------
    // Mask pre-calculation
    //--------------------------------------
    // in contiguous transfers that are unaligned, there will be some
    // invalid bytes at the beginning and the end of the stream
    // example: 25B in 64 bit system
    // iiiivvvv|vvvvvvvv|vvvvvvvv|vvvvviii
    // last msk|----full mask----|first msk

    // offsets needed for masks to fill/empty buffer
    logic [StrbWidth-1:0] r_first_mask;
    logic [StrbWidth-1:0] r_last_mask;
    logic [StrbWidth-1:0] w_first_mask;
    logic [StrbWidth-1:0] w_last_mask;

    // read align masks
    // first mask: clears the `offset` lowest byte lanes
    // last mask:  clears the `tailer` highest byte lanes. For a tailer of 0 the
    //             expression evaluates to an all-zero mask; the mask generators
    //             below therefore only apply it if the tailer is non-zero.
    assign r_first_mask = '1 << r_offset_i;
    assign r_last_mask  = '1 >> (StrbWidth - r_tailer_i);

    // write align masks (same construction as the read masks)
    assign w_first_mask = '1 << w_offset_i;
    assign w_last_mask  = '1 >> (StrbWidth - w_tailer_i);


    //--------------------------------------
    // Barrel shifter
    //--------------------------------------
    // data arrives in chunks of length DataWidth, the buffer will be filled with
    // the realigned data. StrbWidth bytes will be inserted starting from the
    // provided address, overflows will naturally wrap

    // signals connected to the buffer
    logic [StrbWidth-1:0][7:0] buffer_in;

    // read aligned in mask. needs to be rotated together with the data before
    // it can be used to fill in valid data into the buffer
    logic [StrbWidth-1:0]      read_aligned_in_mask;

    // in mask is write aligned, so it is the result of the read aligned in mask
    // that is rotated together with the data in the barrel shifter
    logic [StrbWidth-1:0]      in_mask;

    // a barrel shifter is a concatenation of the same array with itself and a normal
    // shift. The assignment truncates to the lower half, which yields a rotate right
    // by r_shift_i bytes (data) respectively r_shift_i bits (mask).
    assign buffer_in = {r_data_i, r_data_i} >> (r_shift_i * 8);
    assign in_mask   = {read_aligned_in_mask, read_aligned_in_mask}  >> r_shift_i;

    //--------------------------------------
    // In mask generation
    //--------------------------------------
    // in the case of unaligned reads -> not all data is valid
    // is_first_r is the registered flag, is_first_r_d its next-state value
    logic is_first_r, is_first_r_d;

    always_comb begin : proc_in_mask_generator
        // default case: all ones
        read_aligned_in_mask = '1;
        // is first word: some bytes at the beginning may be invalid
        read_aligned_in_mask = is_first_r ?
            read_aligned_in_mask & r_first_mask : read_aligned_in_mask;
        // is last word in read burst: some bytes at the end may be invalid
        // (a tailer of 0 means the last word is fully valid, see r_last_mask)
        if (r_tailer_i != '0) begin
            read_aligned_in_mask = r_last_i ?
                read_aligned_in_mask & r_last_mask : read_aligned_in_mask;
        end
    end

    //--------------------------------------
    // Read control
    //--------------------------------------
    logic [StrbWidth-1:0] buffer_full;
    logic [StrbWidth-1:0] buffer_push;
    logic                 full;
    // a read beat is accepted in this cycle; qualifies the per-lane push signals
    logic                 push;

    // NOTE(review): r_ready_o and push do not depend on r_dp_valid_i, so a read beat
    // is accepted using whatever r_offset_i/r_tailer_i/r_shift_i are applied at that
    // time. Presumably the surrounding backend guarantees a valid request whenever
    // read data arrives - confirm against caller.
    always_comb begin : proc_read_control
        // sticky is first bit for read
        if (r_valid_i & !r_last_i) begin
            // new transfer has started
            is_first_r_d = 1'b0;
        end else if (r_last_i & r_valid_i) begin
            // finish read burst: the next beat is the first one of a new burst
            is_first_r_d = 1'b1;
        end else begin
            // no change
            is_first_r_d = is_first_r;
        end

        // the buffer can be pushed to if all the masked fifo buffers (in_mask) are not full.
        full = |(buffer_full & in_mask);
        // the read can accept data if the buffer is not full
        r_ready_o = ~full;

        // once valid data is applied, it can be pushed in all the selected (in_mask) buffers
        push        = r_valid_i && ~full;
        buffer_push = push ? in_mask : '0;

        // r_dp_ready_o is triggered by the last element arriving from the read
        r_dp_ready_o = r_dp_valid_i && r_last_i && r_valid_i && ~full;
    end

    //--------------------------------------
    // Out mask generation -> wstrb mask
    //--------------------------------------
    // only pop the data actually needed for write from the buffer,
    // determine valid data to pop by calculating the wstrb
    logic [StrbWidth-1:0] out_mask;
    logic                 is_first_w;
    logic                 is_last_w;

    always_comb begin : proc_out_mask_generator
        // default case: all ones
        out_mask = '1;
        // is first word: some bytes at the beginning may be invalid
        out_mask = is_first_w ? (out_mask & w_first_mask) : out_mask;
        // is last word in write burst: some bytes at the end may be invalid
        // (a tailer of 0 means the last word is fully valid, see w_last_mask)
        if (w_tailer_i != '0) begin
            out_mask = is_last_w ? out_mask & w_last_mask : out_mask;
        end
    end

    //--------------------------------------
    // Write control
    //--------------------------------------
    // once the buffer contains all bytes selected by the out mask -> the
    // corresponding fifos are non-empty -> push the line out.
    // signals connected to the buffer
    logic [StrbWidth-1:0][7:0] buffer_out;
    logic [StrbWidth-1:0]      buffer_empty;
    logic [StrbWidth-1:0]      buffer_pop;

    // write is decoupled from read, due to misalignments in the read/write
    // addresses, page crossing can be encountered at any time.
    // To handle this efficiently, a 2-to-1 or 1-to-2 mapping of r/w beats
    // is required. The write unit needs to keep track of progress through
    // a counter and cannot use `r_last` for that.
    logic [7:0] w_num_beats_d, w_num_beats_q;
    logic       w_cnt_valid_d, w_cnt_valid_q;

    // data from buffer is popped (mirror of write_happening, not consumed
    // anywhere else inside this module)
    logic       pop;
    // write happens
    logic       write_happening;
    // buffer is ready to write the requested data
    logic       ready_to_write;
    // first transfer is possible - this signal is used to detect
    // the first write transfer in a burst
    logic       first_possible;
    // buffer is completely empty
    logic       buffer_clean;

    // NOTE(review): ready_to_write (and with it w_valid_o) does not depend on
    // w_dp_valid_i, so a write beat is emitted as soon as the buffer holds the bytes
    // selected by the currently applied w_offset_i/w_tailer_i/w_is_single_i.
    // Presumably the surrounding backend guarantees a valid write request before
    // data can reach the buffer - confirm against caller.
    always_comb begin : proc_write_control
        // defaults: hold the counter state, keep the bus and all strobes quiet
        // counter
        w_num_beats_d   = w_num_beats_q;
        w_cnt_valid_d   = w_cnt_valid_q;
        // buffer ctrl
        pop             = 1'b0;
        buffer_pop      =  '0;
        write_happening = 1'b0;
        ready_to_write  = 1'b0;
        first_possible  = 1'b0;
        // bus signals
        w_valid_o       = 1'b0;
        w_data_o        =  '0;
        w_strb_o        =  '0;
        w_last_o        = 1'b0;
        // mask control
        is_first_w      = 1'b0;
        is_last_w       = 1'b0;
        // data flow
        w_dp_ready_o    = 1'b0;


        // all elements needed (defined by the mask) are in the buffer and the buffer is non-empty
        ready_to_write  = ((~buffer_empty & out_mask) == out_mask) && (buffer_empty != '1);

        // data needed by the first mask is available in the buffer -> r_first happened for sure
        // this signal can be high during a transfer as well, it needs to be masked
        first_possible  = ((~buffer_empty & w_first_mask) == w_first_mask) && (buffer_empty != '1);

        // the buffer is completely empty; used to derive data_path_idle_o
        buffer_clean    = &(buffer_empty);

        // write happening: both the bus (w_ready) and the buffer (ready_to_write) is high
        write_happening = ready_to_write & w_ready_i;

        // a line leaves the buffer in this cycle
        pop             = write_happening;

        // the main buffer is conditionally to the write mask popped
        buffer_pop      = write_happening ? out_mask : '0;

        // signal the bus that we are ready
        w_valid_o       = ready_to_write;

        // control the write to the bus apply data to the bus only if data should be written
        if (ready_to_write == 1'b1) begin
            // assign data from buffers, mask out non valid entries
            for (int i = 0; i < StrbWidth; i++) begin
                w_data_o[i*8 +: 8] = out_mask[i] ? buffer_out[i] : 8'b0;
            end
            // assign the out mask to the strobe
            w_strb_o = out_mask;
        end

        // differentiate between the burst and non-burst case. If a transfer
        // consists just of one beat the counters are disabled
        if (w_is_single_i) begin
            // in the single case the transfer is both first and last.
            is_first_w = 1'b1;
            is_last_w  = 1'b1;

        // in the bursted case the counters are needed to keep track of the progress of sending
        // beats. The w_last_o depends on the state of the counter
        end else begin
            // first transfer happens as soon as a) the buffer is ready for a first transfer and b)
            // the counter is currently invalid
            is_first_w = first_possible & ~w_cnt_valid_q;

            // last happens as soon as a) the counter is valid and b) the counter is now down to 1
            is_last_w  = w_cnt_valid_q & (w_num_beats_q == 8'h01);

            // load the counter with data in a first cycle, only modifying state if bus is ready
            if (is_first_w && write_happening) begin
                w_num_beats_d = w_num_beats_i;
                w_cnt_valid_d = 1'b1;
            end

            // if we hit the last element, invalidate the counter, only modifying state
            // if bus is ready
            if (is_last_w && write_happening) begin
                w_cnt_valid_d = 1'b0;
            end

            // count down the beats if the counter is valid and valid data is written to the bus
            // (cannot collide with the load above: is_first_w requires an invalid counter)
            if (w_cnt_valid_q && write_happening) w_num_beats_d = w_num_beats_q - 8'h01;
        end

        // the w_last_o signal should only be applied to the bus if an actual transfer happens
        w_last_o = is_last_w & ready_to_write;

        // we are ready for the next transfer internally, once the w_last_o signal is applied
        w_dp_ready_o = is_last_w & write_happening;
    end


    //--------------------------------------
    // Buffer - implemented as fifo
    //--------------------------------------
    // one independent byte-wide FIFO per byte lane; each lane is pushed and popped
    // individually according to in_mask / out_mask
    for (genvar i = 0; i < StrbWidth; i++) begin : fifo_buffer
        fifo_v3 #(
            .FALL_THROUGH   ( 1'b0         ),
            .DATA_WIDTH     ( 8            ),
            .DEPTH          ( BufferDepth  )
        ) i_fifo_buffer (
            .clk_i          ( clk_i           ),
            .rst_ni         ( rst_ni          ),
            .flush_i        ( 1'b0            ),
            .testmode_i     ( 1'b0            ),
            .full_o         ( buffer_full [i] ),
            .empty_o        ( buffer_empty[i] ),
            .usage_o        ( ),
            .data_i         ( buffer_in   [i] ),
            .push_i         ( buffer_push [i] ),
            .data_o         ( buffer_out  [i] ),
            .pop_i          ( buffer_pop  [i] )
        );
    end

    //--------------------------------------
    // Module Control
    //--------------------------------------
    // idle: no request pending or completing on either side and nothing buffered
    assign data_path_idle_o = !(r_dp_valid_i | r_dp_ready_o |
                                w_dp_valid_i | w_dp_ready_o | !buffer_clean);

    always_ff @(posedge clk_i or negedge rst_ni) begin : proc_ff
        if (!rst_ni) begin
            // after reset the next read beat is the first beat of a burst
            is_first_r    <= 1'b1;
            w_cnt_valid_q <= 1'b0;
            w_num_beats_q <= 8'h0;
        end else begin
            // the first-beat flag only advances when a read beat is actually accepted
            if (r_valid_i & r_ready_o) is_first_r <= is_first_r_d;
            w_cnt_valid_q <= w_cnt_valid_d;
            w_num_beats_q <= w_num_beats_d;
        end
    end

endmodule : axi_dma_data_path