// Copyright 2018 ETH Zurich and University of Bologna.
// Copyright and related rights are licensed under the Solderpad Hardware
// License, Version 0.51 (the "License"); you may not use this file except in
// compliance with the License.  You may obtain a copy of the License at
// http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
// or agreed to in writing, software, hardware and materials distributed under
// this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
//
// Author: Florian Zaruba, ETH Zurich
// Date: 19.04.2017
// Description: Load Store Unit, handles address calculation and memory interface signals

import ariane_pkg::*;

module load_store_unit #(
    parameter int unsigned ASID_WIDTH = 1
)(
    input  logic                     clk_i,
    input  logic                     rst_ni,
    input  logic                     flush_i,
    output logic                     no_st_pending_o,
    input  logic                     amo_valid_commit_i,

    input  fu_data_t                 fu_data_i,
    output logic                     lsu_ready_o,              // FU is ready e.g. not busy
    input  logic                     lsu_valid_i,              // Input is valid

    output logic [TRANS_ID_BITS-1:0] load_trans_id_o,          // ID of scoreboard entry at which to write back
    output logic [63:0]              load_result_o,
    output logic                     load_valid_o,
    output exception_t               load_exception_o,         // to WB, signal exception status LD exception

    output logic [TRANS_ID_BITS-1:0] store_trans_id_o,         // ID of scoreboard entry at which to write back
    output logic [63:0]              store_result_o,
    output logic                     store_valid_o,
    output exception_t               store_exception_o,        // to WB, signal exception status ST exception

    input  logic                     commit_i,                 // commit the pending store
    output logic                     commit_ready_o,           // commit queue is ready to accept another commit request

    input  logic                     enable_translation_i,     // enable virtual memory translation
    input  logic                     en_ld_st_translation_i,   // enable virtual memory translation for load/stores

    // icache translation requests
    input  icache_areq_o_t           icache_areq_i,
    output icache_areq_i_t           icache_areq_o,

    input  riscv::priv_lvl_t         priv_lvl_i,               // From CSR register file
    input  riscv::priv_lvl_t         ld_st_priv_lvl_i,         // From CSR register file
    input  logic                     sum_i,                    // From CSR register file
    input  logic                     mxr_i,                    // From CSR register file
    input  logic [43:0]              satp_ppn_i,               // From CSR register file
    input  logic [ASID_WIDTH-1:0]    asid_i,                   // From CSR register file
    input  logic                     flush_tlb_i,
    // Performance counters
    output logic                     itlb_miss_o,
    output logic                     dtlb_miss_o,

    // interface to dcache
    input  dcache_req_o_t [2:0]      dcache_req_ports_i,
    output dcache_req_i_t [2:0]      dcache_req_ports_o,
    // AMO interface
    output amo_req_t                 amo_req_o,
    input  amo_resp_t                amo_resp_i
);
    // data is misaligned
    logic data_misaligned;
    // --------------------------------------
    // 1st register stage - (stall registers)
    // --------------------------------------
    // those are the signals which are always correct
    // e.g.: they keep the value in the stall case
    lsu_ctrl_t lsu_ctrl;

    logic      pop_st;
    logic      pop_ld;

    // ------------------------------
    // Address Generation Unit (AGU)
    // ------------------------------
    // virtual address as calculated by the AGU in the first cycle
    logic [63:0] vaddr_i;
    logic [7:0]  be_i;

    assign vaddr_i = $unsigned($signed(fu_data_i.imm) + $signed(fu_data_i.operand_a));

    logic                     st_valid_i;
    logic                     ld_valid_i;
    logic                     ld_translation_req;
    logic                     st_translation_req;
    logic [63:0]              ld_vaddr;
    logic [63:0]              st_vaddr;
    logic                     translation_req;
    logic                     translation_valid;
    logic [63:0]              mmu_vaddr;
    logic [63:0]              mmu_paddr;
    exception_t               mmu_exception;
    logic                     dtlb_hit;

    logic                     ld_valid;
    logic [TRANS_ID_BITS-1:0] ld_trans_id;
    logic [63:0]              ld_result;
    logic                     st_valid;
    logic [TRANS_ID_BITS-1:0] st_trans_id;
    logic [63:0]              st_result;

    logic [11:0]              page_offset;
    logic                     page_offset_matches;

    exception_t               misaligned_exception;
    exception_t               ld_ex;
    exception_t               st_ex;

    // -------------------
    // MMU e.g.: TLBs/PTW
    // -------------------
    mmu #(
        .INSTR_TLB_ENTRIES      ( 16                     ),
        .DATA_TLB_ENTRIES       ( 16                     ),
        .ASID_WIDTH             ( ASID_WIDTH             )
    ) i_mmu (
            // misaligned bypass
        .misaligned_ex_i        ( misaligned_exception   ),
        .lsu_is_store_i         ( st_translation_req     ),
        .lsu_req_i              ( translation_req        ),
        .lsu_vaddr_i            ( mmu_vaddr              ),
        .lsu_valid_o            ( translation_valid      ),
        .lsu_paddr_o            ( mmu_paddr              ),
        .lsu_exception_o        ( mmu_exception          ),
        .lsu_dtlb_hit_o         ( dtlb_hit               ), // send in the same cycle as the request
        // connecting PTW to D$ IF
        .req_port_i             ( dcache_req_ports_i [0] ),
        .req_port_o             ( dcache_req_ports_o [0] ),
        // icache address translation requests
        .icache_areq_i          ( icache_areq_i          ),
        .icache_areq_o          ( icache_areq_o          ),
        .*
    );
    // ------------------
    // Store Unit
    // ------------------
    store_unit i_store_unit (
        .clk_i,
        .rst_ni,
        .flush_i,
        .no_st_pending_o,

        .valid_i               ( st_valid_i           ),
        .lsu_ctrl_i            ( lsu_ctrl             ),
        .pop_st_o              ( pop_st               ),
        .commit_i,
        .commit_ready_o,
        .amo_valid_commit_i,

        .valid_o               ( st_valid             ),
        .trans_id_o            ( st_trans_id          ),
        .result_o              ( st_result            ),
        .ex_o                  ( st_ex                ),
        // MMU port
        .translation_req_o     ( st_translation_req   ),
        .vaddr_o               ( st_vaddr             ),
        .paddr_i               ( mmu_paddr            ),
        .ex_i                  ( mmu_exception        ),
        .dtlb_hit_i            ( dtlb_hit             ),
        // Load Unit
        .page_offset_i         ( page_offset          ),
        .page_offset_matches_o ( page_offset_matches  ),
        // AMOs
        .amo_req_o,
        .amo_resp_i,
        // to memory arbiter
        .req_port_i             ( dcache_req_ports_i [2] ),
        .req_port_o             ( dcache_req_ports_o [2] )
    );

    // ------------------
    // Load Unit
    // ------------------
    load_unit i_load_unit (
        .valid_i               ( ld_valid_i           ),
        .lsu_ctrl_i            ( lsu_ctrl             ),
        .pop_ld_o              ( pop_ld               ),

        .valid_o               ( ld_valid             ),
        .trans_id_o            ( ld_trans_id          ),
        .result_o              ( ld_result            ),
        .ex_o                  ( ld_ex                ),
        // MMU port
        .translation_req_o     ( ld_translation_req   ),
        .vaddr_o               ( ld_vaddr             ),
        .paddr_i               ( mmu_paddr            ),
        .ex_i                  ( mmu_exception        ),
        .dtlb_hit_i            ( dtlb_hit             ),
        // to store unit
        .page_offset_o         ( page_offset          ),
        .page_offset_matches_i ( page_offset_matches  ),
        // to memory arbiter
        .req_port_i            ( dcache_req_ports_i [1] ),
        .req_port_o            ( dcache_req_ports_o [1] ),
        .*
    );

    // ----------------------------
    // Output Pipeline Register
    // ----------------------------
    pipe_reg_simple #(
        .dtype ( logic[$bits(ld_valid) + $bits(ld_trans_id) + $bits(ld_result) + $bits(ld_ex) - 1: 0]),
        .Depth ( NR_LOAD_PIPE_REGS )
    ) i_pipe_reg_load (
        .clk_i,
        .rst_ni,
        .d_i ( {ld_valid, ld_trans_id, ld_result, ld_ex} ),
        .d_o ( {load_valid_o, load_trans_id_o, load_result_o, load_exception_o} )
    );

    pipe_reg_simple #(
        .dtype ( logic[$bits(st_valid) + $bits(st_trans_id) + $bits(st_result) + $bits(st_ex) - 1: 0]),
        .Depth ( NR_STORE_PIPE_REGS )
    ) i_pipe_reg_store (
        .clk_i,
        .rst_ni,
        .d_i ( {st_valid, st_trans_id, st_result, st_ex} ),
        .d_o ( {store_valid_o, store_trans_id_o, store_result_o, store_exception_o} )
    );

    // determine whether this is a load or store
    always_comb begin : which_op

        ld_valid_i = 1'b0;
        st_valid_i = 1'b0;

        translation_req      = 1'b0;
        mmu_vaddr            = 64'b0;

        // check the operator to activate the right functional unit accordingly
        unique case (lsu_ctrl.fu)
            // all loads go here
            LOAD:  begin
                ld_valid_i           = lsu_ctrl.valid;
                translation_req      = ld_translation_req;
                mmu_vaddr            = ld_vaddr;
            end
            // all stores go here
            STORE: begin
                st_valid_i           = lsu_ctrl.valid;
                translation_req      = st_translation_req;
                mmu_vaddr            = st_vaddr;
            end
            // not relevant for the LSU
            default: ;
        endcase
    end


    // ---------------
    // Byte Enable
    // ---------------
    // we can generate the byte enable from the virtual address since the last
    // 12 bit are the same anyway
    // and we can always generate the byte enable from the address at hand
    assign be_i = be_gen(vaddr_i[2:0], extract_transfer_size(fu_data_i.operator));

    // ------------------------
    // Misaligned Exception
    // ------------------------
    // we can detect a misaligned exception immediately
    // the misaligned exception is passed to the functional unit via the MMU, which in case
    // can augment the exception if other memory related exceptions like a page fault or access errors
    always_comb begin : data_misaligned_detection

        misaligned_exception = {
            64'b0,
            64'b0,
            1'b0
        };

        data_misaligned = 1'b0;

        if (lsu_ctrl.valid) begin
            case (lsu_ctrl.operator)
                // double word
                LD, SD, FLD, FSD,
                AMO_LRD, AMO_SCD,
                AMO_SWAPD, AMO_ADDD, AMO_ANDD, AMO_ORD,
                AMO_XORD, AMO_MAXD, AMO_MAXDU, AMO_MIND,
                AMO_MINDU: begin
                    if (lsu_ctrl.vaddr[2:0] != 3'b000) begin
                        data_misaligned = 1'b1;
                    end
                end
                // word
                LW, LWU, SW, FLW, FSW,
                AMO_LRW, AMO_SCW,
                AMO_SWAPW, AMO_ADDW, AMO_ANDW, AMO_ORW,
                AMO_XORW, AMO_MAXW, AMO_MAXWU, AMO_MINW,
                AMO_MINWU: begin
                    if (lsu_ctrl.vaddr[1:0] != 2'b00) begin
                        data_misaligned = 1'b1;
                    end
                end
                // half word
                LH, LHU, SH, FLH, FSH: begin
                    if (lsu_ctrl.vaddr[0] != 1'b0) begin
                        data_misaligned = 1'b1;
                    end
                end
                // byte -> is always aligned
                default:;
            endcase
        end

        if (data_misaligned) begin

            if (lsu_ctrl.fu == LOAD) begin
                misaligned_exception = {
                    riscv::LD_ADDR_MISALIGNED,
                    lsu_ctrl.vaddr,
                    1'b1
                };

            end else if (lsu_ctrl.fu == STORE) begin
                misaligned_exception = {
                    riscv::ST_ADDR_MISALIGNED,
                    lsu_ctrl.vaddr,
                    1'b1
                };
            end
        end

        // we work with SV39, so if VM is enabled, check that all bits [63:38] are equal
        if (en_ld_st_translation_i && !((&lsu_ctrl.vaddr[63:38]) == 1'b1 || (|lsu_ctrl.vaddr[63:38]) == 1'b0)) begin

            if (lsu_ctrl.fu == LOAD) begin
                misaligned_exception = {
                    riscv::LD_ACCESS_FAULT,
                    lsu_ctrl.vaddr,
                    1'b1
                };

            end else if (lsu_ctrl.fu == STORE) begin
                misaligned_exception = {
                    riscv::ST_ACCESS_FAULT,
                    lsu_ctrl.vaddr,
                    1'b1
                };
            end
        end
    end

    // ------------------
    // LSU Control
    // ------------------
    // new data arrives here
    lsu_ctrl_t lsu_req_i;

    assign lsu_req_i = {lsu_valid_i, vaddr_i, fu_data_i.operand_b, be_i, fu_data_i.fu, fu_data_i.operator, fu_data_i.trans_id};

    lsu_bypass lsu_bypass_i (
        .lsu_req_i          ( lsu_req_i   ),
        .lus_req_valid_i    ( lsu_valid_i ),
        .pop_ld_i           ( pop_ld      ),
        .pop_st_i           ( pop_st      ),

        .lsu_ctrl_o         ( lsu_ctrl    ),
        .ready_o            ( lsu_ready_o ),
        .*
    );

endmodule

// ------------------
// LSU Control
// ------------------
// The LSU consists of two independent block which share a common address translation block.
// The one block is the load unit, the other one is the store unit. They will signal their readiness
// with separate signals. If they are not ready the LSU control should keep the last applied signals stable.
// Furthermore it can be the case that another request for one of the two store units arrives in which case
// the LSU control should sample it and store it for later application to the units. It does so, by storing it in a
// two element FIFO. This is necessary as we only know very late in the cycle whether the load/store will succeed (address check,
// TLB hit mainly). So we better unconditionally allow another request to arrive and store this request in case we need to.
module lsu_bypass (
    input  logic      clk_i,
    input  logic      rst_ni,
    input  logic      flush_i,

    input  lsu_ctrl_t lsu_req_i,
    input  logic      lus_req_valid_i,
    input  logic      pop_ld_i,
    input  logic      pop_st_i,

    output lsu_ctrl_t lsu_ctrl_o,
    output logic      ready_o
    );

    lsu_ctrl_t [1:0] mem_n, mem_q;
    logic read_pointer_n, read_pointer_q;
    logic write_pointer_n, write_pointer_q;
    logic [1:0] status_cnt_n, status_cnt_q;

    logic  empty;
    assign empty = (status_cnt_q == 0);
    assign ready_o = empty;

    always_comb begin
        automatic logic [1:0] status_cnt;
        automatic logic write_pointer;
        automatic logic read_pointer;

        status_cnt = status_cnt_q;
        write_pointer = write_pointer_q;
        read_pointer = read_pointer_q;

        mem_n = mem_q;
        // we've got a valid LSU request
        if (lus_req_valid_i) begin
            mem_n[write_pointer_q] = lsu_req_i;
            write_pointer++;
            status_cnt++;
        end

        if (pop_ld_i) begin
            // invalidate the result
            mem_n[read_pointer_q].valid = 1'b0;
            read_pointer++;
            status_cnt--;
        end

        if (pop_st_i) begin
            // invalidate the result
            mem_n[read_pointer_q].valid = 1'b0;
            read_pointer++;
            status_cnt--;
        end

        if (pop_st_i && pop_ld_i)
            mem_n = '{default: 0};

        if (flush_i) begin
            status_cnt = '0;
            write_pointer = '0;
            read_pointer = '0;
            mem_n = '{default: 0};
        end
        // default assignments
        read_pointer_n  = read_pointer;
        write_pointer_n = write_pointer;
        status_cnt_n    = status_cnt;
    end

    // output assignment
    always_comb begin : output_assignments
        if (empty) begin
            lsu_ctrl_o = lsu_req_i;
        end else begin
            lsu_ctrl_o = mem_q[read_pointer_q];
        end
    end

    // registers
    always_ff @(posedge clk_i or negedge rst_ni) begin
        if (~rst_ni) begin
            mem_q           <= '{default: 0};
            status_cnt_q    <= '0;
            write_pointer_q <= '0;
            read_pointer_q  <= '0;
        end else begin
            mem_q           <= mem_n;
            status_cnt_q    <= status_cnt_n;
            write_pointer_q <= write_pointer_n;
            read_pointer_q  <= read_pointer_n;
        end
    end
endmodule