// Copyright 2021 ETH Zurich and University of Bologna.
// Solderpad Hardware License, Version 0.51, see LICENSE for details.
// SPDX-License-Identifier: SHL-0.51

// Samuel Riedel  <sriedel@iis.ee.ethz.ch>

`include "common_cells/registers.svh"

/// An actual cache lookup.
///
/// Two-stage lookup pipeline: a tag stage reads the tag memories of all sets in
/// parallel and compares them against the requested address; a data stage then
/// reads the data memory, but only on a hit. Both memories are accessed through
/// a single port that is shared between lookups, refill writes (`write_*`) and
/// an initialization/flush sweep that overwrites every tag entry with zero.
/// Port priority is: init/flush sweep, then refill write, then lookup.
module snitch_icache_lookup #(
    // Cache configuration. The all-zero default is rejected by an assertion
    // below (outside synthesis), so it has to be overridden.
    parameter snitch_icache_pkg::config_t CFG = '0
)(
    input  logic                        clk_i,
    input  logic                        rst_ni,

    // Flush request. Always accepted (`flush_ready_o` is tied high).
    input  logic                        flush_valid_i,
    output logic                        flush_ready_o,

    // Lookup request (valid/ready handshake).
    input  logic [CFG.FETCH_AW-1:0]     in_addr_i,
    input  logic [CFG.ID_WIDTH_REQ-1:0] in_id_i,
    input  logic                        in_valid_i,
    output logic                        in_ready_o,

    // Lookup response (valid/ready handshake). Address and ID are passed
    // through the pipeline alongside the hit/error/set/data results.
    output logic [CFG.FETCH_AW-1:0]     out_addr_o,
    output logic [CFG.ID_WIDTH_REQ-1:0] out_id_o,
    output logic [CFG.SET_ALIGN-1:0]    out_set_o,
    output logic                        out_hit_o,
    output logic [CFG.LINE_WIDTH-1:0]   out_data_o,
    output logic                        out_error_o,
    output logic                        out_valid_o,
    input  logic                        out_ready_i,

    // Refill write: stores a line (tag, error flag and data) into set
    // `write_set_i` at line index `write_addr_i`.
    input  logic [CFG.COUNT_ALIGN-1:0]  write_addr_i,
    input  logic [CFG.SET_ALIGN-1:0]    write_set_i,
    input  logic [CFG.LINE_WIDTH-1:0]   write_data_i,
    input  logic [CFG.TAG_WIDTH-1:0]    write_tag_i,
    input  logic                        write_error_i,
    input  logic                        write_valid_i,
    output logic                        write_ready_o
);

    // Address width of the data memory, which is addressed as {set, line index}.
    // NOTE(review): assumes CFG.SET_ALIGN == $clog2(CFG.SET_COUNT) so that the
    // concatenations used for `lookup_addr`/`write_addr` match this width -- confirm.
    localparam int unsigned DATA_ADDR_WIDTH = $clog2(CFG.SET_COUNT) + CFG.COUNT_ALIGN;

    `ifndef SYNTHESIS
    // Catch instantiations that forgot to override the configuration.
    initial assert(CFG != '0);
    `endif

    // Sweep counter. It is one bit wider than a line index so that it can also
    // hold the terminal value CFG.LINE_COUNT, which marks "sweep finished".
    logic [CFG.COUNT_ALIGN:0] init_count_q;
    logic                     init_phase;

    // We are always ready to flush
    assign flush_ready_o = 1'b1;
    // The sweep is active until the counter reaches CFG.LINE_COUNT.
    assign init_phase = init_count_q != $unsigned(CFG.LINE_COUNT);
    // Initialization and flush FSM
    // After reset the counter runs from 0 up to CFG.LINE_COUNT and then stops.
    // A flush request seen while the counter is stopped restarts it from 0.
    // A flush request seen while a sweep is still running does not restart it,
    // because the increment branch takes priority.
    always_ff @(posedge clk_i, negedge rst_ni) begin
        if (!rst_ni)
            init_count_q <= '0;
        else if (init_count_q != $unsigned(CFG.LINE_COUNT))
            init_count_q <= init_count_q + 1;
        else if (flush_valid_i)
            init_count_q <= '0;
    end

    // --------------------------------------------------
    // Tag stage
    // --------------------------------------------------
    // Metadata of the request currently held in the tag stage.
    typedef struct packed {
        logic [CFG.FETCH_AW-1:0]     addr;
        logic [CFG.ID_WIDTH_REQ-1:0] id;
    } tag_req_t;

    // Result of the tag comparison: set index from the lzc, hit and error flags.
    typedef struct packed {
        logic [CFG.SET_ALIGN-1:0] cset;
        logic                     hit;
        logic                     error;
    } tag_rsp_t;

    logic                       req_valid, req_ready;
    logic                       req_handshake;

    // Shared tag memory port. A tag word is {valid, error, tag}: bit
    // TAG_WIDTH+1 is the valid flag, bit TAG_WIDTH the error flag and the
    // low TAG_WIDTH bits hold the tag.
    logic [CFG.COUNT_ALIGN-1:0] tag_addr;
    logic [CFG.SET_COUNT-1:0]   tag_enable;
    logic [CFG.TAG_WIDTH+1:0]   tag_wdata, tag_rdata [CFG.SET_COUNT];
    logic                       tag_write;

    tag_req_t                   tag_req_d, tag_req_q;
    // tag_rsp_s: computed from the current memory outputs; tag_rsp_q: buffered
    // copy; tag_rsp: the one selected for the data stage.
    tag_rsp_t                   tag_rsp_s, tag_rsp_d, tag_rsp_q, tag_rsp;
    logic                       tag_valid, tag_ready;
    logic                       tag_handshake;

    logic [CFG.TAG_WIDTH-1:0]   required_tag;
    logic [CFG.SET_COUNT-1:0]   line_hit;

    // Data memory addresses of the pending lookup and of the refill write.
    logic [DATA_ADDR_WIDTH-1:0] lookup_addr;
    logic [DATA_ADDR_WIDTH-1:0] write_addr;

    // Connect input requests to tag stage
    assign tag_req_d.addr = in_addr_i;
    assign tag_req_d.id   = in_id_i;

    // Multiplex read and write access to the tag banks onto one port, prioritizing write accesses
    // Priority: init/flush sweep > refill write > lookup. While the sweep or a
    // write owns the port, neither `in_ready_o` nor `req_valid` is asserted, so
    // lookups stall.
    always_comb begin
        // Defaults: address the line selected by the incoming lookup, no access.
        tag_addr   = in_addr_i >> CFG.LINE_ALIGN;
        tag_enable = '0;
        tag_wdata  = {1'b1, write_error_i, write_tag_i};
        tag_write  = 1'b0;

        write_ready_o = 1'b0;
        in_ready_o    = 1'b0;
        req_valid     = 1'b0;

        if (init_phase) begin
            // Sweep: write an all-zero word (valid flag cleared) to the entry
            // selected by the counter, in every set at once.
            tag_addr   = init_count_q;
            tag_enable = '1;
            tag_wdata  = '0;
            tag_write  = 1'b1;
        end else if (write_valid_i) begin
            // Write a refill request
            tag_addr      = write_addr_i;
            // One-hot enable: only the targeted set is written.
            tag_enable    = $unsigned(1 << write_set_i);
            tag_write     = 1'b1;
            // Writes are accepted in the same cycle, unconditionally.
            write_ready_o = 1'b1;
        end else if (in_valid_i) begin
            // Check cache
            // Read all sets in parallel at the default (lookup) address.
            tag_enable = '1;
            in_ready_o = req_ready;
            // Request to store data in pipeline
            req_valid  = 1'b1;
        end
    end

    // Instantiate the tag sets.
    // One memory per set, either a latch-based SCM or an SRAM macro depending
    // on CFG.L1_TAG_SCM. Both share the same address, write data and write
    // strobe; `tag_enable[i]` selects the individual set.
    for (genvar i = 0; i < CFG.SET_COUNT; i++) begin : g_sets
        if (CFG.L1_TAG_SCM) begin : gen_scm
            latch_scm #(
                .ADDR_WIDTH ($clog2(CFG.LINE_COUNT)),
                .DATA_WIDTH (CFG.TAG_WIDTH+2       )
            ) i_tag (
                .clk         ( clk_i                       ),
                .ReadEnable  ( tag_enable[i] && !tag_write ),
                .ReadAddr    ( tag_addr                    ),
                .ReadData    ( tag_rdata[i]                ),
                .WriteEnable ( tag_enable[i] && tag_write  ),
                .WriteAddr   ( tag_addr                    ),
                .WriteData   ( tag_wdata                   )
            );
        end else begin : gen_sram
            tc_sram #(
                .DataWidth ( CFG.TAG_WIDTH+2 ),
                .NumWords  ( CFG.LINE_COUNT  ),
                .NumPorts  ( 1               )
            ) i_tag (
                .clk_i   ( clk_i         ),
                .rst_ni  ( rst_ni        ),
                .req_i   ( tag_enable[i] ),
                .we_i    ( tag_write     ),
                .addr_i  ( tag_addr      ),
                .wdata_i ( tag_wdata     ),
                .be_i    ( '1            ),
                .rdata_o ( tag_rdata[i]  )
            );
        end
    end

    // Determine which set hit
    // The comparison uses the registered request address (`tag_req_q`), i.e. the
    // address of the request whose tag read data is now at the memory outputs.
    always_comb begin
        automatic logic [CFG.SET_COUNT-1:0] errors;
        // The tag is the part of the address above the line offset and line index.
        required_tag = tag_req_q.addr >> (CFG.LINE_ALIGN + CFG.COUNT_ALIGN);
        for (int i = 0; i < CFG.SET_COUNT; i++) begin
            // Hit: the entry's valid flag is set and its stored tag matches.
            line_hit[i] = tag_rdata[i][CFG.TAG_WIDTH+1] && tag_rdata[i][CFG.TAG_WIDTH-1:0] == required_tag;
            // The error flag only counts for the set that actually hit.
            errors[i] = tag_rdata[i][CFG.TAG_WIDTH] && line_hit[i];
        end
        tag_rsp_s.hit = |line_hit;
        tag_rsp_s.error = |errors;
    end

    // Encode the per-set hit vector into a set index.
    // NOTE(review): presumably returns the index of the hitting set; the exact
    // counting direction depends on the lzc module's default MODE -- confirm.
    // `empty_o` is left unconnected; "no set hit" is carried by `tag_rsp_s.hit`.
    lzc #(.WIDTH(CFG.SET_COUNT)) i_lzc (
        .in_i     ( line_hit       ),
        .cnt_o    ( tag_rsp_s.cset ),
        .empty_o  (                )
    );

    // Buffer the metadata on a valid handshake. Stall on write (implicit in req_valid/ready)
    `FFL(tag_req_q, tag_req_d, req_valid && req_ready, '0, clk_i, rst_ni)
    // Tag stage occupancy: set whenever a lookup is requested, otherwise cleared
    // once the data stage takes the entry, otherwise held.
    `FF(tag_valid, req_valid ? 1'b1 : tag_ready ? 1'b0 : tag_valid, '0, clk_i, rst_ni)
    // Ready if buffer is empty or downstream is reading. Stall on write
    // (`tag_write` is high during both the init sweep and refill writes.)
    assign req_ready = (!tag_valid || tag_ready) && !tag_write;

    // Register the handshake of the reg stage to buffer the tag output data in the next cycle
    `FF(req_handshake, req_valid && req_ready, 1'b0, clk_i, rst_ni)

    // Fall-through buffer the tag data: Store the tag data if the SRAM bank accepted a request in
    // the previous cycle and if we actually have to buffer them because the receiver is not ready
    `FF(tag_rsp_q, tag_rsp_d, '0, clk_i, rst_ni)
    // In the cycle right after a request was accepted, use the freshly computed
    // result; in later cycles the tag memory port may have been reused, so use
    // the buffered copy instead.
    assign tag_rsp = req_handshake ? tag_rsp_s : tag_rsp_q;
    always_comb begin
        tag_rsp_d = tag_rsp_q;
        // Load the FF if new data is incoming and downstream is not ready
        if (req_handshake && !tag_ready) begin
            tag_rsp_d = tag_rsp_s;
        end
        // Override the hit if the write that stalled us invalidated the data
        // NOTE(review): this condition is not qualified with `!init_phase`,
        // although refill writes are only performed outside the init phase --
        // confirm this is intended.
        if (lookup_addr == write_addr && write_valid_i) begin
            tag_rsp_d.hit = 1'b0;
        end
    end

    // --------------------------------------------------
    // Data stage
    // --------------------------------------------------

    // Metadata of the request held in the data stage: the tag stage request
    // plus the result of the tag comparison.
    typedef struct packed {
        logic [CFG.FETCH_AW-1:0]     addr;
        logic [CFG.ID_WIDTH_REQ-1:0] id;
        logic [CFG.SET_ALIGN-1:0]    cset;
        logic                        hit;
        logic                        error;
    } data_req_t;

    // One cache line of data.
    typedef logic [CFG.LINE_WIDTH-1:0] data_rsp_t;

    // Shared data memory port.
    logic [DATA_ADDR_WIDTH-1:0] data_addr;
    logic                       data_enable;
    data_rsp_t                  data_wdata, data_rdata;
    logic                       data_write;

    data_req_t                  data_req_d, data_req_q;
    data_rsp_t                  data_rsp_q;
    logic                       data_valid, data_ready;

    // Connect tag stage response to data stage request
    assign data_req_d.addr  = tag_req_q.addr;
    assign data_req_d.id    = tag_req_q.id;
    assign data_req_d.cset  = tag_rsp.cset;
    assign data_req_d.hit   = tag_rsp.hit;
    assign data_req_d.error = tag_rsp.error;

    // Data memory addresses are {set index, line index}.
    assign lookup_addr = {tag_rsp.cset, tag_req_q.addr[CFG.LINE_ALIGN +: CFG.COUNT_ALIGN]};
    assign write_addr  = {write_set_i, write_addr_i};

    // Data bank port mux
    always_comb begin
        // Default read request
        data_addr   = lookup_addr;
        data_enable = tag_valid && tag_rsp.hit; // Only read data on hit
        data_wdata  = write_data_i;
        data_write  = 1'b0;
        // Write takes priority
        // Same condition under which the tag port mux accepts the refill write,
        // so tag and data of a refilled line are written in the same cycle.
        if (!init_phase && write_valid_i) begin
            data_addr   = write_addr;
            data_enable = 1'b1;
            data_write  = 1'b1;
        end
    end

    // Single data memory holding the lines of all sets.
    tc_sram #(
        .DataWidth ( CFG.LINE_WIDTH                 ),
        .NumWords  ( CFG.LINE_COUNT * CFG.SET_COUNT ),
        .NumPorts  ( 1                              )
    ) i_data (
        .clk_i   ( clk_i       ),
        .rst_ni  ( rst_ni      ),
        .req_i   ( data_enable ),
        .we_i    ( data_write  ),
        .addr_i  ( data_addr   ),
        .wdata_i ( data_wdata  ),
        .be_i    ( '1          ),
        .rdata_o ( data_rdata  )
    );

    // Buffer the metadata on a valid handshake. Stall on write (implicit in tag_ready)
    `FFL(data_req_q, data_req_d, tag_valid && tag_ready, '0, clk_i, rst_ni)
    // Data stage occupancy: set when the tag stage holds a valid entry and no
    // write occupies the data port, otherwise cleared once the output is
    // accepted, otherwise held.
    `FF(data_valid, (tag_valid && !data_write) ? 1'b1 : data_ready ? 1'b0 : data_valid, '0, clk_i, rst_ni)
    // Ready if buffer is empty or downstream is reading. Stall on write
    assign tag_ready = (!data_valid || data_ready) && !data_write;

    // Register the handshake of the tag stage to buffer the data output data in the next cycle
    // but only if it was a hit. Otherwise, the data is not read anyway.
    `FF(tag_handshake, tag_valid && tag_ready && data_req_d.hit, 1'b0, clk_i, rst_ni)

    // Fall-through buffer the read data: Store the read data if the SRAM bank accepted a request in
    // the previous cycle and if we actually have to buffer them because the receiver is not ready
    `FFL(data_rsp_q, data_rdata, tag_handshake && !data_ready, '0, clk_i, rst_ni)
    // Use the memory output directly in the cycle after the read was issued,
    // and the buffered copy in any later cycle.
    assign out_data_o = tag_handshake ? data_rdata : data_rsp_q;

    // Generate the remaining output signals.
    assign out_addr_o  = data_req_q.addr;
    assign out_id_o    = data_req_q.id;
    assign out_set_o   = data_req_q.cset;
    assign out_hit_o   = data_req_q.hit;
    assign out_error_o = data_req_q.error;
    assign out_valid_o = data_valid;
    assign data_ready  = out_ready_i;

endmodule