// Copyright 2018 ETH Zurich and University of Bologna.
// Copyright and related rights are licensed under the Solderpad Hardware
// License, Version 0.51 (the "License"); you may not use this file except in
// compliance with the License. You may obtain a copy of the License at
// http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
// or agreed to in writing, software, hardware and materials distributed under
// this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
//
// Author: Michael Schaffner <schaffner@iis.ee.ethz.ch>, ETH Zurich
// Date: 13.09.2018
// Description: Memory arrays, arbiter and tag comparison for serpent dcache.
//
//
// Notes: 1) all ports can trigger a readout of all ways, and the way where the tag hits is selected
//
//        2) only port0 can write full cache lines. higher ports are read only. also, port0 can only read the tag array,
//           and does not trigger a cache line readout.
//
//        3) the single word write port is a separate port without access to the tag memory.
//           these single word writes can interleave with read operations if they go to different
//           cacheline offsets, since each word offset is placed into a different SRAM bank.
//
//        4) Read ports with same priority are RR arbitrated.
//             but high prio ports (rd_prio_i[port_nr] = 1'b1) will stall
//             low prio ports (rd_prio_i[port_nr] = 1'b0)

import ariane_pkg::*;
import serpent_cache_pkg::*;

// Data and tag/valid memory arrays of the data cache, together with the read
// port arbitration, the tag comparison / hit generation and the byte-wise
// overlay of write buffer contents onto the read data.
//
// Parameters:
//   NumPorts - number of read ports that are arbitrated onto the memories.
module serpent_dcache_mem #(
    parameter int unsigned NumPorts = 3
  )(
    input  logic                                              clk_i,
    input  logic                                              rst_ni,

    // ports
    input  logic  [NumPorts-1:0][DCACHE_TAG_WIDTH-1:0]        rd_tag_i,           // tag in - comes one cycle later
    input  logic  [NumPorts-1:0][DCACHE_CL_IDX_WIDTH-1:0]     rd_idx_i,
    input  logic  [NumPorts-1:0][DCACHE_OFFSET_WIDTH-1:0]     rd_off_i,
    input  logic  [NumPorts-1:0]                              rd_req_i,           // read the word at offset off_i[:3] in all ways
    input  logic  [NumPorts-1:0]                              rd_tag_only_i,      // only do a tag/valid lookup, no access to data arrays
    input  logic  [NumPorts-1:0]                              rd_prio_i,          // 0: low prio, 1: high prio
    output logic  [NumPorts-1:0]                              rd_ack_o,           // grant from the read port arbiter
    output logic                [DCACHE_SET_ASSOC-1:0]        rd_vld_bits_o,      // valid bit of each way, read from the tag SRAMs
    output logic                [DCACHE_SET_ASSOC-1:0]        rd_hit_oh_o,        // per-way tag hit (tag match & valid & compare enable)
    output logic                [63:0]                        rd_data_o,          // selected word with write buffer bytes overlaid

    // only available on port 0, uses address signals of port 0
    input  logic                                              wr_cl_vld_i,
    input  logic                                              wr_cl_nc_i,         // noncacheable access
    input  logic                [DCACHE_SET_ASSOC-1:0]        wr_cl_we_i,         // writes a full cacheline
    input  logic                [DCACHE_TAG_WIDTH-1:0]        wr_cl_tag_i,
    input  logic                [DCACHE_CL_IDX_WIDTH-1:0]     wr_cl_idx_i,
    input  logic                [DCACHE_OFFSET_WIDTH-1:0]     wr_cl_off_i,
    input  logic                [DCACHE_LINE_WIDTH-1:0]       wr_cl_data_i,
    input  logic                [DCACHE_LINE_WIDTH/8-1:0]     wr_cl_data_be_i,
    input  logic                [DCACHE_SET_ASSOC-1:0]        wr_vld_bits_i,      // valid bits written together with the tag

    // separate port for single word write, no tag access
    input  logic                [DCACHE_SET_ASSOC-1:0]        wr_req_i,           // write a single word to offset off_i[:3]
    output logic                                              wr_ack_o,
    input  logic                [DCACHE_CL_IDX_WIDTH-1:0]     wr_idx_i,
    input  logic                [DCACHE_OFFSET_WIDTH-1:0]     wr_off_i,
    input  logic                [63:0]                        wr_data_i,
    input  logic                [7:0]                         wr_data_be_i,

    // forwarded wbuffer
    input  wbuffer_t            [DCACHE_WBUF_DEPTH-1:0]       wbuffer_data_i
  );

  // per-bank request / write enable / byte enable / index of the data SRAMs
  logic [DCACHE_NUM_BANKS-1:0]                                  bank_req;
  logic [DCACHE_NUM_BANKS-1:0]                                  bank_we;
  logic [DCACHE_NUM_BANKS-1:0][DCACHE_SET_ASSOC-1:0][7:0]       bank_be;
  logic [DCACHE_NUM_BANKS-1:0][DCACHE_CL_IDX_WIDTH-1:0]         bank_idx;
  // index and offset of the current access, and their registered copies
  logic [DCACHE_CL_IDX_WIDTH-1:0]                               bank_idx_d, bank_idx_q;
  logic [DCACHE_OFFSET_WIDTH-1:0]                               bank_off_d, bank_off_q;

  logic [DCACHE_NUM_BANKS-1:0][DCACHE_SET_ASSOC-1:0][63:0]      bank_wdata;                   // write data per bank and way
  logic [DCACHE_NUM_BANKS-1:0][DCACHE_SET_ASSOC-1:0][63:0]      bank_rdata;                   // read data per bank and way
  logic [DCACHE_SET_ASSOC-1:0][63:0]                            rdata_cl;                     // selected word from each cacheline

  logic [DCACHE_TAG_WIDTH-1:0]                                  rd_tag;
  logic [DCACHE_SET_ASSOC-1:0]                                  vld_req;                      // bit enable for valid regs
  logic                                                         vld_we;                       // valid bits write enable
  logic [DCACHE_SET_ASSOC-1:0]                                  vld_wdata;                    // valid bits to write
  logic [DCACHE_SET_ASSOC-1:0][DCACHE_TAG_WIDTH-1:0]            tag_rdata;                    // these are the tags coming from the tagmem
  logic [DCACHE_CL_IDX_WIDTH-1:0]                               vld_addr;                     // address applied to the tag/valid SRAMs

  // index of the read port selected by the arbiter, and its registered copy
  logic [$clog2(NumPorts)-1:0]                                  vld_sel_d, vld_sel_q;

  logic [DCACHE_WBUF_DEPTH-1:0]                                 wbuffer_hit_oh;
  logic [7:0]                                                   wbuffer_be;
  logic [63:0]                                                  wbuffer_rdata, rdata;
  logic [63:0]                                                  wbuffer_cmp_addr;

  logic                                                         cmp_en_d, cmp_en_q;
  logic                                                         rd_acked;
  logic [NumPorts-1:0]                                          bank_collision, rd_req_masked, rd_req_prio;

///////////////////////////////////////////////////////
// arbiter
///////////////////////////////////////////////////////

  // Priority is highest for lowest read port index
  // NOTE(review): the arbiter instantiated below is `rrarbiter`; confirm that
  // the fixed-priority statement above still applies.
  //
  // SRAM bank mapping (bank k holds 64bit word k of every way):
  //
  // Bank 0                   Bank 1
  // [way0, w0] [way1, w0] .. [way0, w1] [way1, w1] ..

  // byte enable mapping
  generate
    for (genvar k=0;k<DCACHE_NUM_BANKS;k++) begin : g_bank
      for (genvar j=0;j<DCACHE_SET_ASSOC;j++) begin : g_bank_way
        // a full cache line write takes precedence over an acknowledged
        // single word write; without either, no byte of this way is enabled
        assign bank_be[k][j]   = (wr_cl_we_i[j] & wr_cl_vld_i) ? wr_cl_data_be_i[k*8 +: 8] :
                                 (wr_req_i[j]   & wr_ack_o)    ? wr_data_be_i              :
                                                                 '0;

        // the same write data is presented to every way; the byte enables
        // above decide which way actually gets written
        assign bank_wdata[k][j] = (wr_cl_vld_i) ? wr_cl_data_i[k*64 +: 64] :
                                                  wr_data_i;
      end
    end
  endgenerate

  // cache line writes override the address of the arbitrated read port
  assign vld_wdata  = wr_vld_bits_i;
  assign vld_addr   = (wr_cl_vld_i) ? wr_cl_idx_i : rd_idx_i[vld_sel_d];
  assign rd_tag     = rd_tag_i[vld_sel_q]; //delayed by one cycle
  assign bank_off_d = (wr_cl_vld_i) ? wr_cl_off_i : rd_off_i[vld_sel_d];
  assign bank_idx_d = (wr_cl_vld_i) ? wr_cl_idx_i : rd_idx_i[vld_sel_d];
  // on a cache line write only the written ways are accessed,
  // an acknowledged read accesses the tags of all ways
  assign vld_req    = (wr_cl_vld_i) ? wr_cl_we_i  : (rd_acked) ? '1 : '0;

  // priority masking
  // disable low prio requests when any of the high prio reqs is present
  assign rd_req_prio   = rd_req_i & rd_prio_i;
  assign rd_req_masked = (|rd_req_prio) ? rd_req_prio : rd_req_i;

  // read port arbiter
  // the arbiter is disabled while a cache line write is in progress
  rrarbiter #(
    .NUM_REQ(NumPorts)
  ) i_rrarbiter (
    .clk_i  ( clk_i         ),
    .rst_ni ( rst_ni        ),
    .flush_i( 1'b0          ),
    .en_i   ( ~wr_cl_vld_i  ),
    .req_i  ( rd_req_masked ),
    .ack_o  ( rd_ack_o      ),
    .vld_o  ( rd_acked      ),
    .idx_o  ( vld_sel_d     )
  );

  // generates the request, write enable and index of each data bank
  always_comb begin : p_bank_req
    // defaults: no bank access, all banks point to the single word write index
    vld_we   = wr_cl_vld_i;
    bank_req = '0;
    wr_ack_o = '0;
    bank_we  = '0;
    bank_idx = '{default:wr_idx_i};

    // read port k collides with the single word write if both
    // address the same 64bit word offset, i.e. the same bank
    for(int k=0; k<NumPorts; k++) begin
      bank_collision[k] = rd_off_i[k][DCACHE_OFFSET_WIDTH-1:3] == wr_off_i[DCACHE_OFFSET_WIDTH-1:3];
    end

    if(wr_cl_vld_i & |wr_cl_we_i) begin
      // full cache line write: access and write all banks at the same index
      bank_req = '1;
      bank_we  = '1;
      bank_idx = '{default:wr_cl_idx_i};
    end else begin
      if(rd_acked) begin
        // tag-only lookups do not touch the data banks
        if(~rd_tag_only_i[vld_sel_d]) begin
          bank_req = dcache_cl_bin2oh(rd_off_i[vld_sel_d][DCACHE_OFFSET_WIDTH-1:3]);
          bank_idx[rd_off_i[vld_sel_d][DCACHE_OFFSET_WIDTH-1:3]] = rd_idx_i[vld_sel_d];
        end
      end

      if(|wr_req_i) begin
        // the single word write is acknowledged if the selected read is tag-only,
        // or if there is no acknowledged read that targets the same bank
        if(rd_tag_only_i[vld_sel_d] | ~(rd_ack_o[vld_sel_d] & bank_collision[vld_sel_d])) begin
          wr_ack_o = 1'b1;
          bank_req |= dcache_cl_bin2oh(wr_off_i[DCACHE_OFFSET_WIDTH-1:3]);
          bank_we   = dcache_cl_bin2oh(wr_off_i[DCACHE_OFFSET_WIDTH-1:3]);
        end
      end
    end
  end

///////////////////////////////////////////////////////
// tag comparison, hit generation, readout muxes
///////////////////////////////////////////////////////

  logic [DCACHE_OFFSET_WIDTH-1:0]          wr_cl_off;
  logic [$clog2(DCACHE_WBUF_DEPTH)-1:0]    wbuffer_hit_idx;
  logic [$clog2(DCACHE_SET_ASSOC)-1:0]     rd_hit_idx;

  // tags are only compared in the cycle after a tag read (not after a tag write)
  assign cmp_en_d = (|vld_req) & ~vld_we;

  // word tag comparison in write buffer
  // uses the write address during a cache line write, and the address
  // of the previously issued read access otherwise
  assign wbuffer_cmp_addr = (wr_cl_vld_i) ? {wr_cl_tag_i, wr_cl_idx_i, wr_cl_off_i} :
                                            {rd_tag, bank_idx_q, bank_off_q};
  // hit generation
  generate
    for (genvar i=0;i<DCACHE_SET_ASSOC;i++) begin : g_tag_cmpsel
      // tag comparison of way i, qualified with its valid bit and the registered compare enable
      assign rd_hit_oh_o[i] = (rd_tag == tag_rdata[i]) & rd_vld_bits_o[i]  & cmp_en_q;
      // word select of way i: take the bank addressed by the registered word offset
      assign rdata_cl[i]    = bank_rdata[bank_off_q[DCACHE_OFFSET_WIDTH-1:3]][i];
    end

    for(genvar k=0; k<DCACHE_WBUF_DEPTH; k++) begin : g_wbuffer_hit
      // entry k hits if any of its bytes is valid and its word tag
      // equals the compare address without the lower 3 byte offset bits
      assign wbuffer_hit_oh[k] = (|wbuffer_data_i[k].valid) & (wbuffer_data_i[k].wtag == (wbuffer_cmp_addr >> 3));
    end
  endgenerate

  // convert the hit vectors into indices
  // NOTE(review): presumably `lzc` returns the position of the first set bit;
  // confirm against the lzc implementation.
  lzc #(
    .WIDTH ( DCACHE_WBUF_DEPTH )
  ) i_lzc_wbuffer_hit (
    .in_i    ( wbuffer_hit_oh   ),
    .cnt_o   ( wbuffer_hit_idx  ),
    .empty_o (                  )
  );

  lzc #(
    .WIDTH ( DCACHE_SET_ASSOC )
  ) i_lzc_rd_hit (
    .in_i    ( rd_hit_oh_o  ),
    .cnt_o   ( rd_hit_idx   ),
    .empty_o (              )
  );

  assign wbuffer_rdata = wbuffer_data_i[wbuffer_hit_idx].data;
  // no byte is overlaid unless some write buffer entry hits
  assign wbuffer_be    = (|wbuffer_hit_oh) ? wbuffer_data_i[wbuffer_hit_idx].valid : '0;

  // noncacheable accesses always use word 0 of the cache line data
  assign wr_cl_off     = (wr_cl_nc_i) ? '0 : wr_cl_off_i[DCACHE_OFFSET_WIDTH-1:3];
  // during a cache line write the incoming data is forwarded,
  // otherwise the word of the hitting way is returned
  assign rdata         = (wr_cl_vld_i)  ? wr_cl_data_i[wr_cl_off*64 +: 64] :
                                          rdata_cl[rd_hit_idx];

  // overlay bytes that hit in the write buffer
  generate
    for(genvar k=0; k<8; k++) begin : g_rd_data
      assign rd_data_o[8*k +: 8] = (wbuffer_be[k]) ? wbuffer_rdata[8*k +: 8] : rdata[8*k +: 8];
    end
  endgenerate

///////////////////////////////////////////////////////
// memory arrays and regs
///////////////////////////////////////////////////////

  // raw tag SRAM read data per way: {valid bit, tag}
  logic [DCACHE_TAG_WIDTH:0] vld_tag_rdata [DCACHE_SET_ASSOC-1:0];

  generate
    for (genvar k = 0; k < DCACHE_NUM_BANKS; k++) begin : g_data_banks
      // Data RAM
      // one SRAM per word offset, holding that 64bit word of all ways
      sram #(
        .DATA_WIDTH ( ariane_pkg::DCACHE_SET_ASSOC * 64 ),
        .NUM_WORDS  ( serpent_cache_pkg::DCACHE_NUM_WORDS )
      ) i_data_sram (
        .clk_i      ( clk_i               ),
        .rst_ni     ( rst_ni              ),
        .req_i      ( bank_req   [k]      ),
        .we_i       ( bank_we    [k]      ),
        .addr_i     ( bank_idx   [k]      ),
        .wdata_i    ( bank_wdata [k]      ),
        .be_i       ( bank_be    [k]      ),
        .rdata_o    ( bank_rdata [k]      )
      );
    end

    for (genvar i = 0; i < DCACHE_SET_ASSOC; i++) begin : g_tag_srams

      // split the SRAM word into tag (lower bits) and valid bit (top bit)
      assign tag_rdata[i]     = vld_tag_rdata[i][DCACHE_TAG_WIDTH-1:0];
      assign rd_vld_bits_o[i] = vld_tag_rdata[i][DCACHE_TAG_WIDTH];

      // Tag RAM
      sram #(
        // tag + valid bit
        .DATA_WIDTH ( ariane_pkg::DCACHE_TAG_WIDTH + 1 ),
        .NUM_WORDS  ( serpent_cache_pkg::DCACHE_NUM_WORDS )
      ) i_tag_sram (
        .clk_i     ( clk_i                       ),
        .rst_ni    ( rst_ni                      ),
        .req_i     ( vld_req[i]                  ),
        .we_i      ( vld_we                      ),
        .addr_i    ( vld_addr                    ),
        .wdata_i   ( {vld_wdata[i], wr_cl_tag_i} ),
        .be_i      ( '1                          ),
        .rdata_o   ( vld_tag_rdata[i]            )
      );
    end
  endgenerate

  // registers the index, offset, port selection and compare enable of the
  // current access so that they are available in the following cycle
  // NOTE(review): this presumably matches a one-cycle SRAM read latency;
  // confirm against the sram implementation.
  always_ff @(posedge clk_i or negedge rst_ni) begin : p_regs
    if(~rst_ni) begin
      bank_idx_q <= '0;
      bank_off_q <= '0;
      vld_sel_q  <= '0;
      cmp_en_q   <= '0;
    end else begin
      bank_idx_q <= bank_idx_d;
      bank_off_q <= bank_off_d;
      vld_sel_q  <= vld_sel_d ;
      cmp_en_q   <= cmp_en_d;
    end
  end

///////////////////////////////////////////////////////
// assertions
///////////////////////////////////////////////////////

//pragma translate_off
`ifndef VERILATOR

  // at most one way may hit in the cycle after a tag read of all ways
  hit_hot1: assert property (
      @(posedge clk_i) disable iff (~rst_ni) &vld_req |-> ~vld_we |=> $onehot0(rd_hit_oh_o))
        else $fatal(1,"[l1 dcache] rd_hit_oh_o signal must be hot1");

  // an acknowledged single word write may target at most one way
  word_write_hot1: assert property (
      @(posedge clk_i) disable iff (~rst_ni) wr_ack_o |-> $onehot0(wr_req_i))
        else $fatal(1,"[l1 dcache] wr_req_i signal must be hot1");

  // at most one write buffer entry may hit in the cycle after a tag read of all ways
  wbuffer_hit_hot1: assert property (
      @(posedge clk_i) disable iff (~rst_ni) &vld_req |-> ~vld_we |=> $onehot0(wbuffer_hit_oh))
        else $fatal(1,"[l1 dcache] wbuffer_hit_oh signal must be hot1");

  // this is only used for verification!
  // mirrors of the valid bits and tags written into the tag SRAMs
  logic                                    vld_mirror[serpent_cache_pkg::DCACHE_NUM_WORDS-1:0][ariane_pkg::DCACHE_SET_ASSOC-1:0];
  logic [ariane_pkg::DCACHE_TAG_WIDTH-1:0] tag_mirror[serpent_cache_pkg::DCACHE_NUM_WORDS-1:0][ariane_pkg::DCACHE_SET_ASSOC-1:0];
  logic [ariane_pkg::DCACHE_SET_ASSOC-1:0] tag_write_duplicate_test;

  // track every tag/valid write of way i at address vld_addr
  always_ff @(posedge clk_i or negedge rst_ni) begin : p_mirror
    if(~rst_ni) begin
      vld_mirror <= '{default:'0};
      tag_mirror <= '{default:'0};
    end else begin
      for (int i = 0; i < DCACHE_SET_ASSOC; i++) begin
        if(vld_req[i] & vld_we) begin
          vld_mirror[vld_addr][i] <= vld_wdata[i];
          tag_mirror[vld_addr][i] <= wr_cl_tag_i;
        end
      end
    end
  end

  // way i is flagged if it already holds a valid line with the tag that is
  // being written while at least one valid bit is being set
  generate
    for (genvar i = 0; i < DCACHE_SET_ASSOC; i++) begin
      assign tag_write_duplicate_test[i] = (tag_mirror[vld_addr][i] == wr_cl_tag_i) & vld_mirror[vld_addr][i] & (|vld_wdata);
    end
  endgenerate

  // a tag write must not allocate a line that is already present in the set
  tag_write_duplicate: assert property (
      @(posedge clk_i) disable iff (~rst_ni) |vld_req |-> vld_we |-> ~(|tag_write_duplicate_test))
        else $fatal(1,"[l1 dcache] cannot allocate a CL that is already present in the cache");

  // disabled debug helper, kept for reference
  // logic tst;
  // always_comb begin : p_test
  //   tst = tag == 44'h13;
  //   // for (int k=0; k<DCACHE_SET_ASSOC;k++) begin
  //   //   tst |= tag_rdata[k] == 44'h96;
  //   // end
  //   tst &= bank_idx_d == 64'h0C;
  //   tst &= |wr_cl_we_i;
  // end

`endif
//pragma translate_on

endmodule // serpent_dcache_mem