// File: serpent_dcache_mem.sv
// Copyright 2018 ETH Zurich and University of Bologna.
// Copyright and related rights are licensed under the Solderpad Hardware
// License, Version 0.51 (the "License"); you may not use this file except in
// compliance with the License.  You may obtain a copy of the License at
// http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
// or agreed to in writing, software, hardware and materials distributed under
// this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
//
// Author: Michael Schaffner <schaffner@iis.ee.ethz.ch>, ETH Zurich
// Date: 13.09.2018
// Description: Memory arrays, arbiter and tag comparison for serpent dcache.
//
//
// Notes: 1) all ports can trigger a readout of all ways, and the way where the tag hits is selected
//  
//        2) only port0 can write full cache lines. higher ports are read only. also, port0 can only read the tag array,
//           and does not trigger a cache line readout.
//
//        3) the single word write port is a separate port without access to the tag memory.
//           these single word writes can interleave with read operations if they go to different 
//           cacheline offsets, since each word offset is placed into a different SRAM bank.
//
//        4) Read ports with the same priority are arbitrated round-robin (RR), but high prio ports
//           (rd_prio_i[port_nr] = 1'b1) will stall low prio ports (rd_prio_i[port_nr] = 1'b0)

import ariane_pkg::*;
import serpent_cache_pkg::*;

// serpent_dcache_mem
//
// Contains the data SRAM banks, the per-way tag/valid SRAMs, the read port
// arbiter, the tag comparison logic and the write buffer forwarding mux of
// the data cache.
//
// Parameters:
//   NumPorts - number of read ports that compete for the memory arrays.
//
// Timing: a read is issued in the cycle it is acknowledged (rd_ack_o); the
// tag of the acknowledged port is sampled one cycle later (rd_tag_i indexed
// with the registered port select), and rd_hit_oh_o is qualified with the
// registered compare enable.
// NOTE(review): this assumes that the sram macro returns its read data in the
// cycle after the request - the sram module is not visible here, confirm.
module serpent_dcache_mem #(
        parameter int unsigned NumPorts     = 3
    )(
        input  logic                                              clk_i,
        input  logic                                              rst_ni,

        // read ports
        input  logic  [NumPorts-1:0][DCACHE_TAG_WIDTH-1:0]        rd_tag_i,           // tag in - comes one cycle later
        input  logic  [NumPorts-1:0][DCACHE_CL_IDX_WIDTH-1:0]     rd_idx_i,           // cache line index per port
        input  logic  [NumPorts-1:0][DCACHE_OFFSET_WIDTH-1:0]     rd_off_i,           // byte offset per port, bits [:3] select the bank
        input  logic  [NumPorts-1:0]                              rd_req_i,           // read the word at offset off_i[:3] in all ways
        input  logic  [NumPorts-1:0]                              rd_tag_only_i,      // only do a tag/valid lookup, no access to data arrays
        input  logic  [NumPorts-1:0]                              rd_prio_i,          // 0: low prio, 1: high prio
        output logic  [NumPorts-1:0]                              rd_ack_o,           // grant from the read port arbiter
        output logic                [DCACHE_SET_ASSOC-1:0]        rd_vld_bits_o,      // valid bits read from the tag SRAMs
        output logic                [DCACHE_SET_ASSOC-1:0]        rd_hit_oh_o,        // per-way hit indication
        output logic                [63:0]                        rd_data_o,          // read word with write buffer bytes overlaid

        // only available on port 0, uses address signals of port 0
        input  logic                                              wr_cl_vld_i,
        input  logic                                              wr_cl_nc_i,         // noncacheable access
        input  logic                [DCACHE_SET_ASSOC-1:0]        wr_cl_we_i,         // writes a full cacheline
        input  logic                [DCACHE_TAG_WIDTH-1:0]        wr_cl_tag_i,
        input  logic                [DCACHE_CL_IDX_WIDTH-1:0]     wr_cl_idx_i,
        input  logic                [DCACHE_OFFSET_WIDTH-1:0]     wr_cl_off_i,
        input  logic                [DCACHE_LINE_WIDTH-1:0]       wr_cl_data_i,
        input  logic                [DCACHE_LINE_WIDTH/8-1:0]     wr_cl_data_be_i,
        input  logic                [DCACHE_SET_ASSOC-1:0]        wr_vld_bits_i,      // valid bits written together with the tag

        // separate port for single word write, no tag access
        input  logic                [DCACHE_SET_ASSOC-1:0]        wr_req_i,           // per-way request: write a single word to offset wr_off_i[:3]
        output logic                                              wr_ack_o,
        input  logic                [DCACHE_CL_IDX_WIDTH-1:0]     wr_idx_i,
        input  logic                [DCACHE_OFFSET_WIDTH-1:0]     wr_off_i,
        input  logic                [63:0]                        wr_data_i,
        input  logic                [7:0]                         wr_data_be_i,

        // forwarded wbuffer
        input wbuffer_t             [DCACHE_WBUF_DEPTH-1:0]       wbuffer_data_i
    );


    // per-bank SRAM control signals
    logic [DCACHE_NUM_BANKS-1:0]                                  bank_req;
    logic [DCACHE_NUM_BANKS-1:0]                                  bank_we;
    logic [DCACHE_NUM_BANKS-1:0][DCACHE_SET_ASSOC-1:0][7:0]       bank_be;
    logic [DCACHE_NUM_BANKS-1:0][DCACHE_CL_IDX_WIDTH-1:0]         bank_idx;
    // index and offset of the current access, registered for the compare stage
    logic [DCACHE_CL_IDX_WIDTH-1:0]                               bank_idx_d, bank_idx_q;
    logic [DCACHE_OFFSET_WIDTH-1:0]                               bank_off_d, bank_off_q;

    logic [DCACHE_NUM_BANKS-1:0][DCACHE_SET_ASSOC-1:0][63:0]      bank_wdata;                   // write data per bank and way
    logic [DCACHE_NUM_BANKS-1:0][DCACHE_SET_ASSOC-1:0][63:0]      bank_rdata;                   // read data per bank and way
    logic [DCACHE_SET_ASSOC-1:0][63:0]                            rdata_cl;                     // selected word from each cacheline

    logic [DCACHE_TAG_WIDTH-1:0]                                  rd_tag;
    logic [DCACHE_SET_ASSOC-1:0]                                  vld_req;                      // bit enable for valid regs
    logic                                                         vld_we;                       // valid bits write enable
    logic [DCACHE_SET_ASSOC-1:0]                                  vld_wdata;                    // valid bits to write
    logic [DCACHE_SET_ASSOC-1:0][DCACHE_TAG_WIDTH-1:0]            tag_rdata;                    // these are the tags coming from the tagmem
    logic                       [DCACHE_CL_IDX_WIDTH-1:0]         vld_addr;                     // address of the tag/valid SRAMs

    // index of the granted read port (d: current cycle, q: one cycle later)
    logic [$clog2(NumPorts)-1:0]                                  vld_sel_d, vld_sel_q;

    logic [DCACHE_WBUF_DEPTH-1:0]                                 wbuffer_hit_oh;
    logic [7:0]                                                   wbuffer_be;
    logic [63:0]                                                  wbuffer_rdata, rdata;
    logic [63:0]                                                  wbuffer_cmp_addr;

    logic                                                         cmp_en_d, cmp_en_q;
    logic                                                         rd_acked;
    logic [NumPorts-1:0]                                          bank_collision, rd_req_masked, rd_req_prio;

///////////////////////////////////////////////////////
// arbiter
///////////////////////////////////////////////////////

    // Read ports are granted by i_rrarbiter after the priority masking below.
    // NOTE(review): the original comment here said "priority is highest for
    // lowest read port index"; the actual grant order is defined by rrarbiter,
    // which is not visible in this file - confirm.
    //
    // SRAM bank mapping (one bank per 64bit word offset, all ways side by side):
    //
    // Bank 0                   Bank 1
    // [way0, w0] [way1, w0] .. [way0, w1] [way1, w1] ..

    // byte enable and write data mapping
    generate
        for (genvar k=0;k<DCACHE_NUM_BANKS;k++) begin : g_bank
            for (genvar j=0;j<DCACHE_SET_ASSOC;j++) begin : g_bank_way
                // a full cacheline write to way j has precedence over a single word write
                assign bank_be[k][j]   = (wr_cl_we_i[j] & wr_cl_vld_i) ? wr_cl_data_be_i[k*8 +: 8] :
                                         (wr_req_i[j]   & wr_ack_o)    ? wr_data_be_i              :
                                                                         '0;

                // The data select must use the same per-way condition as the byte
                // enable select above. Selecting on wr_cl_vld_i alone would pair the
                // single word byte enables with cacheline data whenever wr_cl_vld_i
                // is asserted without any wr_cl_we_i bit set and a single word write
                // is acknowledged in the same cycle.
                assign bank_wdata[k][j] = (wr_cl_we_i[j] & wr_cl_vld_i) ? wr_cl_data_i[k*64 +: 64] :
                                                                           wr_data_i;
            end
        end
    endgenerate

    // the cacheline write port owns the tag/valid arrays whenever wr_cl_vld_i is set,
    // otherwise the address of the granted read port is used
    assign vld_wdata  = wr_vld_bits_i;
    assign vld_addr   = (wr_cl_vld_i) ? wr_cl_idx_i   : rd_idx_i[vld_sel_d];
    assign rd_tag     = rd_tag_i[vld_sel_q]; //delayed by one cycle
    assign bank_off_d = (wr_cl_vld_i) ? wr_cl_off_i   : rd_off_i[vld_sel_d];
    assign bank_idx_d = (wr_cl_vld_i) ? wr_cl_idx_i   : rd_idx_i[vld_sel_d];
    // reads access the tags of all ways, cacheline writes only the enabled ways
    assign vld_req    = (wr_cl_vld_i) ? wr_cl_we_i    : (rd_acked) ? '1 : '0;


    // priority masking
    // disable low prio requests when any of the high prio reqs is present
    assign rd_req_prio   = rd_req_i & rd_prio_i;
    assign rd_req_masked = (|rd_req_prio) ? rd_req_prio : rd_req_i;

    // read port arbiter, disabled while the cacheline write port is active
    rrarbiter #(
        .NUM_REQ(NumPorts)
    ) i_rrarbiter (
        .clk_i  ( clk_i         ),
        .rst_ni ( rst_ni        ),
        .flush_i( 1'b0          ),
        .en_i   ( ~wr_cl_vld_i  ),
        .req_i  ( rd_req_masked ),
        .ack_o  ( rd_ack_o      ),
        .vld_o  ( rd_acked      ),
        .idx_o  ( vld_sel_d     )
    );

    // Generates the request, write enable and index of each data bank and
    // the acknowledge of the single word write port.
    always_comb begin : p_bank_req

        // defaults: no bank access, all banks point to the single word write index
        vld_we   = wr_cl_vld_i;
        bank_req = '0;
        wr_ack_o = '0;
        bank_we  = '0;
        bank_idx = '{default:wr_idx_i};

        // a read port collides with the single word write if both address the same bank
        for(int k=0; k<NumPorts; k++) begin
            bank_collision[k] = rd_off_i[k][DCACHE_OFFSET_WIDTH-1:3] == wr_off_i[DCACHE_OFFSET_WIDTH-1:3];
        end

        if(wr_cl_vld_i & |wr_cl_we_i) begin
            // full cacheline write: occupies all banks, nothing else is served
            bank_req = '1;
            bank_we  = '1;
            bank_idx = '{default:wr_cl_idx_i};
        end else begin
            if(rd_acked) begin
                // a granted read only touches the bank of its word offset,
                // tag-only lookups leave the data banks untouched
                if(~rd_tag_only_i[vld_sel_d]) begin
                    bank_req                                               = dcache_cl_bin2oh(rd_off_i[vld_sel_d][DCACHE_OFFSET_WIDTH-1:3]);
                    bank_idx[rd_off_i[vld_sel_d][DCACHE_OFFSET_WIDTH-1:3]] = rd_idx_i[vld_sel_d];
                end
            end

            if(|wr_req_i) begin
                // the single word write is served unless the selected read port is
                // acknowledged for a data access to the very same bank
                if(rd_tag_only_i[vld_sel_d] | ~(rd_ack_o[vld_sel_d] & bank_collision[vld_sel_d])) begin
                    wr_ack_o = 1'b1;
                    bank_req |= dcache_cl_bin2oh(wr_off_i[DCACHE_OFFSET_WIDTH-1:3]);
                    bank_we   = dcache_cl_bin2oh(wr_off_i[DCACHE_OFFSET_WIDTH-1:3]);
                end
            end
        end
    end

///////////////////////////////////////////////////////
// tag comparison, hit generation, readout muxes
///////////////////////////////////////////////////////

    logic [DCACHE_OFFSET_WIDTH-1:0]       wr_cl_off;
    logic [$clog2(DCACHE_WBUF_DEPTH)-1:0] wbuffer_hit_idx;
    logic [$clog2(DCACHE_SET_ASSOC)-1:0]  rd_hit_idx;

    // compare only in the cycle after a tag read (not after a tag write)
    assign cmp_en_d = (|vld_req) & ~vld_we;

    // word tag comparison in write buffer
    assign wbuffer_cmp_addr = (wr_cl_vld_i) ? {wr_cl_tag_i, wr_cl_idx_i, wr_cl_off_i} :
                                              {rd_tag, bank_idx_q, bank_off_q};
    // hit generation
    generate
        for (genvar i=0;i<DCACHE_SET_ASSOC;i++) begin : g_tag_cmpsel
            // tag comparison of way i, qualified with the valid bit and the compare enable
            assign rd_hit_oh_o[i] = (rd_tag == tag_rdata[i]) & rd_vld_bits_o[i]  & cmp_en_q;
            // word offset mux of way i: pick the bank addressed by the registered offset
            assign rdata_cl[i] = bank_rdata[bank_off_q[DCACHE_OFFSET_WIDTH-1:3]][i];
        end

        for(genvar k=0; k<DCACHE_WBUF_DEPTH; k++) begin : g_wbuffer_hit
            // a write buffer entry hits if any of its bytes is valid and its word tag matches
            assign wbuffer_hit_oh[k] = (|wbuffer_data_i[k].valid) & (wbuffer_data_i[k].wtag == (wbuffer_cmp_addr >> 3));
        end
    endgenerate


    // convert the hit vectors into indices
    // NOTE(review): presumably lzc returns the index of the first set bit - confirm against the lzc module
    lzc #(
        .WIDTH ( DCACHE_WBUF_DEPTH )
    ) i_lzc_wbuffer_hit (
        .in_i    ( wbuffer_hit_oh   ),
        .cnt_o   ( wbuffer_hit_idx  ),
        .empty_o (                  )
    );

    lzc #(
        .WIDTH ( DCACHE_SET_ASSOC )
    ) i_lzc_rd_hit (
        .in_i    ( rd_hit_oh_o  ),
        .cnt_o   ( rd_hit_idx   ),
        .empty_o (              )
    );

    assign wbuffer_rdata = wbuffer_data_i[wbuffer_hit_idx].data;
    // without a write buffer hit no byte is overlaid
    assign wbuffer_be    = (|wbuffer_hit_oh) ? wbuffer_data_i[wbuffer_hit_idx].valid : '0;

    // noncacheable accesses always take word 0 of the incoming line
    assign wr_cl_off     = (wr_cl_nc_i)     ? '0 : wr_cl_off_i[DCACHE_OFFSET_WIDTH-1:3];
    // while a cacheline is being written, the read data is taken directly from the incoming line
    assign rdata         = (wr_cl_vld_i)    ? wr_cl_data_i[wr_cl_off*64 +: 64] :
                                              rdata_cl[rd_hit_idx];

    // overlay bytes that hit in the write buffer
    generate
        for(genvar k=0; k<8; k++) begin : g_rd_data
            assign rd_data_o[8*k +: 8] = (wbuffer_be[k]) ? wbuffer_rdata[8*k +: 8] : rdata[8*k +: 8];
        end
    endgenerate



///////////////////////////////////////////////////////
// memory arrays and regs
///////////////////////////////////////////////////////

    // raw tag SRAM read data per way: {valid, tag}
    logic [DCACHE_TAG_WIDTH:0] vld_tag_rdata [DCACHE_SET_ASSOC-1:0];

    generate
        for (genvar k = 0; k < DCACHE_NUM_BANKS; k++) begin : g_data_banks
            // Data RAM, one per word offset, holds this word for all ways
            sram #(
                .DATA_WIDTH ( ariane_pkg::DCACHE_SET_ASSOC * 64 ),
                .NUM_WORDS  ( serpent_cache_pkg::DCACHE_NUM_WORDS    )
            ) i_data_sram (
                .clk_i      ( clk_i               ),
                .rst_ni     ( rst_ni              ),
                .req_i      ( bank_req   [k]      ),
                .we_i       ( bank_we    [k]      ),
                .addr_i     ( bank_idx   [k]      ),
                .wdata_i    ( bank_wdata [k]      ),
                .be_i       ( bank_be    [k]      ),
                .rdata_o    ( bank_rdata [k]      )
            );
        end

        for (genvar i = 0; i < DCACHE_SET_ASSOC; i++) begin : g_tag_srams

            // split the SRAM word into tag (lower bits) and valid bit (MSB)
            assign tag_rdata[i]     = vld_tag_rdata[i][DCACHE_TAG_WIDTH-1:0];
            assign rd_vld_bits_o[i] = vld_tag_rdata[i][DCACHE_TAG_WIDTH];

            // Tag RAM, one per way
            sram #(
                // tag + valid bit
                .DATA_WIDTH ( ariane_pkg::DCACHE_TAG_WIDTH + 1 ),
                .NUM_WORDS  ( serpent_cache_pkg::DCACHE_NUM_WORDS   )
            ) i_tag_sram (
                .clk_i     ( clk_i               ),
                .rst_ni    ( rst_ni              ),
                .req_i     ( vld_req[i]          ),
                .we_i      ( vld_we              ),
                .addr_i    ( vld_addr            ),
                .wdata_i   ( {vld_wdata[i], wr_cl_tag_i} ),
                .be_i      ( '1                  ),
                .rdata_o   ( vld_tag_rdata[i]    )
            );
        end
    endgenerate


    // pipeline registers that carry index, offset, port select and compare
    // enable of the current access into the compare cycle
    always_ff @(posedge clk_i or negedge rst_ni) begin : p_regs
        if(~rst_ni) begin
            bank_idx_q <= '0;
            bank_off_q <= '0;
            vld_sel_q  <= '0;
            cmp_en_q   <= '0;
        end else begin
            bank_idx_q <= bank_idx_d;
            bank_off_q <= bank_off_d;
            vld_sel_q  <= vld_sel_d ;
            cmp_en_q   <= cmp_en_d;
        end
    end



///////////////////////////////////////////////////////
// assertions
///////////////////////////////////////////////////////

//pragma translate_off
`ifndef VERILATOR

    // after a tag read of all ways, at most one way may hit
    hit_hot1: assert property (
        @(posedge clk_i) disable iff (~rst_ni) &vld_req |-> ~vld_we |=> $onehot0(rd_hit_oh_o))
            else $fatal(1,"[l1 dcache] rd_hit_oh_o signal must be hot1");

    // an acknowledged single word write may target at most one way
    word_write_hot1: assert property (
        @(posedge clk_i) disable iff (~rst_ni) wr_ack_o |-> $onehot0(wr_req_i))
            else $fatal(1,"[l1 dcache] wr_req_i signal must be hot1");

    // after a tag read of all ways, at most one write buffer entry may hit
    wbuffer_hit_hot1: assert property (
        @(posedge clk_i) disable iff (~rst_ni) &vld_req |-> ~vld_we |=> $onehot0(wbuffer_hit_oh))
            else $fatal(1,"[l1 dcache] wbuffer_hit_oh signal must be hot1");

    // this is only used for verification!
    // mirror of the valid bits and tags written to the tag SRAMs
    logic                                    vld_mirror[serpent_cache_pkg::DCACHE_NUM_WORDS-1:0][ariane_pkg::DCACHE_SET_ASSOC-1:0];
    logic [ariane_pkg::DCACHE_TAG_WIDTH-1:0] tag_mirror[serpent_cache_pkg::DCACHE_NUM_WORDS-1:0][ariane_pkg::DCACHE_SET_ASSOC-1:0];
    logic [ariane_pkg::DCACHE_SET_ASSOC-1:0] tag_write_duplicate_test;

    // track every tag/valid write in the mirror arrays
    always_ff @(posedge clk_i or negedge rst_ni) begin : p_mirror
        if(~rst_ni) begin
            vld_mirror <= '{default:'0};
            tag_mirror <= '{default:'0};
        end else begin
            for (int i = 0; i < DCACHE_SET_ASSOC; i++) begin
                if(vld_req[i] & vld_we) begin
                    vld_mirror[vld_addr][i] <= vld_wdata[i];
                    tag_mirror[vld_addr][i] <= wr_cl_tag_i;
                end
            end
        end
    end

    // flags ways that already hold the tag about to be written as a valid entry
    generate
        for (genvar i = 0; i < DCACHE_SET_ASSOC; i++) begin
            assign tag_write_duplicate_test[i] = (tag_mirror[vld_addr][i] == wr_cl_tag_i) & vld_mirror[vld_addr][i] & (|vld_wdata);
        end
    endgenerate

    // a tag write must not allocate a line whose tag is already valid in the addressed set
    tag_write_duplicate: assert property (
        @(posedge clk_i) disable iff (~rst_ni) |vld_req |-> vld_we |-> ~(|tag_write_duplicate_test))
            else $fatal(1,"[l1 dcache] cannot allocate a CL that is already present in the cache");

    // logic tst;
    // always_comb begin : p_test
    //     tst = tag == 44'h13;
    //     // for (int k=0; k<DCACHE_SET_ASSOC;k++) begin
    //     //     tst |= tag_rdata[k] == 44'h96;
    //     // end
    //     tst &= bank_idx_d == 64'h0C;
    //     tst &= |wr_cl_we_i;
    // end

`endif
//pragma translate_on

endmodule // serpent_dcache_mem