// Copyright 2018 ETH Zurich and University of Bologna. // Copyright and related rights are licensed under the Solderpad Hardware // License, Version 0.51 (the "License"); you may not use this file except in // compliance with the License. You may obtain a copy of the License at // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law // or agreed to in writing, software, hardware and materials distributed under // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. // // Author: Michael Schaffner <schaffner@iis.ee.ethz.ch>, ETH Zurich // Date: 15.08.2018 // Description: Instruction cache that is compatible with openpiton. // // Some notes: // // 1) refills always have the size of one cache line, except for accesses to the I/O region, which is mapped // to the top half of the physical address space (bit 39 = 1). the data width of the interface has the width // of one cache line, and hence the ifills can be transferred in a single cycle. note that the ifills must be // consumed unconditionally. // // 2) instruction fetches are always assumed to be aligned to 32bit (lower 2 bits are ignored) // // 3) NC accesses to I/O space are expected to return 32bit from memory. 
//

import ariane_pkg::*;
import serpent_cache_pkg::*;

module serpent_icache #(
    parameter logic [DCACHE_ID_WIDTH-1:0] RdTxId            = 0,                // ID to be used for read transactions
    parameter bit                         Axi64BitCompliant = 1'b0,             // set this to 1 when using in conjunction with 64bit AXI bus adapter
    parameter logic [63:0]                CachedAddrBeg     = 64'h00_8000_0000, // begin of cached region
    parameter logic [63:0]                CachedAddrEnd     = 64'h80_0000_0000  // end of cached region
) (
    input  logic            clk_i,
    input  logic            rst_ni,

    input  logic            flush_i,        // flush the icache, flush and kill have to be asserted together
    input  logic            en_i,           // enable icache
    output logic            miss_o,         // to performance counter
    // address translation requests
    input  icache_areq_i_t  areq_i,
    output icache_areq_o_t  areq_o,
    // data requests
    input  icache_dreq_i_t  dreq_i,
    output icache_dreq_o_t  dreq_o,
    // refill port
    input  logic            mem_rtrn_vld_i,
    input  icache_rtrn_t    mem_rtrn_i,
    output logic            mem_data_req_o,
    input  logic            mem_data_ack_i,
    output icache_req_t     mem_data_o
);

    // signals
    logic                                 cache_en_d, cache_en_q;       // cache is enabled
    logic [63:0]                          vaddr_d, vaddr_q;
    logic                                 paddr_is_nc;                  // asserted if physical address is non-cacheable
    logic [ICACHE_SET_ASSOC-1:0]          cl_hit;                       // hit from tag compare
    logic                                 cache_rden;                   // triggers cache lookup
    logic                                 cache_wren;                   // triggers write to cacheline
    logic                                 cmp_en_d, cmp_en_q;           // enable tag comparison in next cycle. used to cut long path due to NC signal.
    logic                                 flush_d, flush_q;             // used to register and signal pending flushes

    // replacement strategy
    logic                                 update_lfsr;                  // shift the LFSR
    logic [$clog2(ICACHE_SET_ASSOC)-1:0]  inv_way;                      // first non-valid encountered
    logic [$clog2(ICACHE_SET_ASSOC)-1:0]  rnd_way;                      // random index for replacement
    logic [$clog2(ICACHE_SET_ASSOC)-1:0]  repl_way;                     // way to replace
    logic [ICACHE_SET_ASSOC-1:0]          repl_way_oh_d, repl_way_oh_q; // way to replace (onehot)
    logic                                 all_ways_valid;               // we need to switch repl strategy since all are valid

    // invalidations / flushing
    logic                                 inv_en;                       // incoming invalidations
    logic                                 flush_en, flush_done;         // used to flush cache entries
    logic [ICACHE_CL_IDX_WIDTH-1:0]       flush_cnt_d, flush_cnt_q;     // used to flush cache entries

    // mem arrays
    logic                                 cl_we;                        // write enable to memory array
    logic [ICACHE_SET_ASSOC-1:0]          cl_req;                       // request to memory array
    logic [ICACHE_CL_IDX_WIDTH-1:0]       cl_index;                     // this is a cache-line index, to memory array
    logic [ICACHE_OFFSET_WIDTH-1:0]       cl_offset_d, cl_offset_q;     // offset in cache line
    logic [ICACHE_TAG_WIDTH-1:0]          cl_tag_d, cl_tag_q;           // this is the cache tag
    logic [ICACHE_TAG_WIDTH-1:0]          cl_tag_rdata [ICACHE_SET_ASSOC-1:0]; // these are the tags coming from the tagmem
    logic [ICACHE_LINE_WIDTH-1:0]         cl_rdata [ICACHE_SET_ASSOC-1:0];     // these are the cachelines coming from the cache
    logic [ICACHE_SET_ASSOC-1:0][FETCH_WIDTH-1:0]cl_sel;                // selected word from each cacheline
    logic [ICACHE_SET_ASSOC-1:0]          vld_req;                      // bit enable for valid regs
    logic                                 vld_we;                       // valid bits write enable
    logic [ICACHE_SET_ASSOC-1:0]          vld_wdata;                    // valid bits to write
    logic [ICACHE_SET_ASSOC-1:0]          vld_rdata;                    // valid bits coming from valid regs
    logic [ICACHE_CL_IDX_WIDTH-1:0]       vld_addr;                     // valid bit

    // controller FSM
    typedef enum logic[2:0] {FLUSH, IDLE, READ, MISS, TLB_MISS, KILL_ATRANS, KILL_MISS} state_t;
    state_t state_d, state_q;

///////////////////////////////////////////////////////
// address -> cl_index mapping, interface plumbing
///////////////////////////////////////////////////////

    // extract tag from physical address, check if NC
    assign cl_tag_d = (areq_i.fetch_valid) ? areq_i.fetch_paddr[ICACHE_TAG_WIDTH+ICACHE_INDEX_WIDTH-1:ICACHE_INDEX_WIDTH] : cl_tag_q;

    // noncacheable if request goes to I/O space, or if cache is disabled
    assign paddr_is_nc = (cl_tag_d < (CachedAddrBeg>>ICACHE_INDEX_WIDTH)) ||
                         (cl_tag_d >= (CachedAddrEnd>>ICACHE_INDEX_WIDTH)) ||
                         (!cache_en_q);

    // pass exception through
    assign dreq_o.ex = areq_i.fetch_exception;

    // latch this in case we have to stall later on
    // make sure this is 32bit aligned
    assign vaddr_d = (dreq_o.ready & dreq_i.req) ? dreq_i.vaddr : vaddr_q;
    assign areq_o.fetch_vaddr = {vaddr_q>>2, 2'b0};

    // split virtual address into index and offset to address cache arrays
    assign cl_index = vaddr_d[ICACHE_INDEX_WIDTH-1:ICACHE_OFFSET_WIDTH];

    generate
        if(Axi64BitCompliant)begin
            // if we generate a noncacheable access, the word will be at offset 0 or 4 in the cl coming from memory
            assign cl_offset_d = ( dreq_o.ready & dreq_i.req)      ? {dreq_i.vaddr>>2, 2'b0} :
                                 ( paddr_is_nc & mem_data_req_o )  ? cl_offset_q[2]<<2 : // needed since we transfer 32bit over a 64bit AXI bus in this case
                                                                     cl_offset_q;
            // request word address instead of cl address in case of NC access
            assign mem_data_o.paddr = (paddr_is_nc) ? {cl_tag_d, vaddr_q[ICACHE_INDEX_WIDTH-1:3], 3'b0} :                                        // align to 32bit
                                                      {cl_tag_d, vaddr_q[ICACHE_INDEX_WIDTH-1:ICACHE_OFFSET_WIDTH], {ICACHE_OFFSET_WIDTH{1'b0}}}; // align to cl
        end
        if(!Axi64BitCompliant)begin
            // icache fills are either cachelines or 4byte fills, depending on whether they go to the Piton I/O space or not.
            // since the piton cache system replicates the data, we can always index the full CL
            assign cl_offset_d = ( dreq_o.ready & dreq_i.req)      ? {dreq_i.vaddr>>2, 2'b0} :
                                                                     cl_offset_q;
            // request word address instead of cl address in case of NC access
            assign mem_data_o.paddr = (paddr_is_nc) ? {cl_tag_d, vaddr_q[ICACHE_INDEX_WIDTH-1:2], 2'b0} :                                        // align to 32bit
                                                      {cl_tag_d, vaddr_q[ICACHE_INDEX_WIDTH-1:ICACHE_OFFSET_WIDTH], {ICACHE_OFFSET_WIDTH{1'b0}}}; // align to cl
        end
    endgenerate

    assign mem_data_o.tid = RdTxId;
    assign mem_data_o.nc = paddr_is_nc;
    // way that is being replaced
    assign mem_data_o.way = repl_way;
    assign dreq_o.vaddr = vaddr_q;

///////////////////////////////////////////////////////
// main control logic
///////////////////////////////////////////////////////

    always_comb begin : p_fsm
        // default assignment
        state_d = state_q;
        cache_en_d = cache_en_q & en_i;// disabling the cache is always possible, enable needs to go via flush
        flush_en = 1'b0;
        cmp_en_d = 1'b0;
        cache_rden = 1'b0;
        cache_wren = 1'b0;
        inv_en = 1'b0;
        flush_d = flush_q | flush_i; // register incoming flush

        // interfaces
        dreq_o.ready = 1'b0;
        areq_o.fetch_req = 1'b0;
        dreq_o.valid = 1'b0;
        mem_data_req_o = 1'b0;
        // performance counter
        miss_o = 1'b0;

        // handle invalidations unconditionally
        // note: invals are mutually exclusive with
        // ifills, since both arrive over the same IF
        // however, we need to make sure below that we
        // do not trigger a cache readout at the same time...
        if (mem_rtrn_vld_i && mem_rtrn_i.rtype == ICACHE_INV_REQ) begin
            inv_en = 1'b1;
        end

        unique case (state_q)
            //////////////////////////////////
            // this clears all valid bits
            FLUSH: begin
                flush_en = 1'b1;
                if (flush_done) begin
                    state_d = IDLE;
                    flush_d = 1'b0;
                    // if the cache was not enabled set this
                    cache_en_d = en_i;
                end
            end
            //////////////////////////////////
            // wait for an incoming request
            IDLE: begin
                // only enable tag comparison if cache is enabled
                cmp_en_d = cache_en_q;

                // handle pending flushes, or perform cache clear upon enable
                if (flush_d | (en_i & ~cache_en_q)) begin
                    state_d = FLUSH;
                // wait for incoming requests
                end else begin
                    // mem requests are for sure invals here
                    if (~mem_rtrn_vld_i) begin
                        dreq_o.ready = 1'b1;
                        // we have a new request
                        if (dreq_i.req) begin
                            cache_rden = 1'b1;
                            state_d = READ;
                        end
                    end
                    if (dreq_i.kill_s1) begin
                        state_d = IDLE;
                    end
                end
            end
            //////////////////////////////////
            // check whether we have a hit
            // in case the cache is disabled,
            // or in case the address is NC, we
            // reuse the miss mechanism to handle
            // the request
            READ: begin
                state_d = TLB_MISS;
                areq_o.fetch_req = '1;
                // only enable tag comparison if cache is enabled
                cmp_en_d = cache_en_q;
                // readout speculatively
                cache_rden = cache_en_q;

                if (areq_i.fetch_valid) begin
                    // check if we have to flush
                    if (flush_d) begin
                        state_d = IDLE;
                    // we have a hit or an exception output valid result
                    end else if ((|cl_hit & cache_en_q) | areq_i.fetch_exception.valid) begin
                        dreq_o.valid = ~dreq_i.kill_s2;// just don't output in this case
                        state_d = IDLE;

                        // we can accept another request
                        // and stay here, but only if no inval is coming in
                        // note: we are not expecting ifill return packets here...
                        if (~mem_rtrn_vld_i) begin
                            dreq_o.ready = 1'b1;
                            if (dreq_i.req) begin
                                state_d = READ;
                            end
                        end
                        // if a request is being killed at this stage,
                        // we have to bail out and wait for the address translation to complete
                        if (dreq_i.kill_s1) begin
                            state_d = IDLE;
                        end
                    // we have a miss / NC transaction
                    end else if (dreq_i.kill_s2) begin
                        state_d = IDLE;
                    end else begin
                        cmp_en_d = 1'b0;
                        // only count this as a miss if the cache is enabled, and
                        // the address is cacheable
                        // send out ifill request
                        mem_data_req_o = 1'b1;
                        if (mem_data_ack_i) begin
                            miss_o = (~paddr_is_nc);
                            state_d = MISS;
                        end
                    end
                // bail out if this request is being killed (and we missed on the TLB)
                end else if (dreq_i.kill_s2 | flush_d) begin
                    state_d = KILL_ATRANS;
                end
            end
            //////////////////////////////////
            // wait until the memory transaction
            // returns. do not write to memory
            // if the nc bit is set.
            TLB_MISS: begin
                areq_o.fetch_req = '1;
                // only enable tag comparison if cache is enabled
                cmp_en_d = cache_en_q;
                // readout speculatively
                cache_rden = cache_en_q;

                if (areq_i.fetch_valid) begin
                    // check if we have to kill this request
                    if (dreq_i.kill_s2 | flush_d) begin
                        state_d = IDLE;
                    // check whether we got an exception
                    end else if (areq_i.fetch_exception.valid) begin
                        dreq_o.valid = 1'b1;
                        state_d = IDLE;
                    // re-trigger cache readout for tag comparison and cache line selection
                    // but if we got an invalidation, we have to wait another cycle
                    end else if (~mem_rtrn_vld_i) begin
                        state_d = READ;
                    end
                // bail out if this request is being killed
                end else if (dreq_i.kill_s2 | flush_d) begin
                    state_d = KILL_ATRANS;
                end
            end
            //////////////////////////////////
            // wait until the memory transaction
            // returns. do not write to memory
            // if the nc bit is set.
            MISS: begin
                // note: this is mutually exclusive with ICACHE_INV_REQ,
                // so we do not have to check for invals here
                if (mem_rtrn_vld_i && mem_rtrn_i.rtype == ICACHE_IFILL_ACK) begin
                    state_d = IDLE;
                    // only return data if request is not being killed
                    if (~(dreq_i.kill_s2 | flush_d)) begin
                        dreq_o.valid = 1'b1;
                        // only write to cache if this address is cacheable
                        cache_wren = ~paddr_is_nc;
                    end
                // bail out if this request is being killed
                end else if (dreq_i.kill_s2 | flush_d) begin
                    state_d = KILL_MISS;
                end
            end
            //////////////////////////////////
            // killed address translation,
            // wait until paddr is valid, and go
            // back to idle
            KILL_ATRANS: begin
                areq_o.fetch_req = '1;
                if (areq_i.fetch_valid) begin
                    state_d = IDLE;
                end
            end
            //////////////////////////////////
            // killed miss,
            // wait until memory responds and
            // go back to idle
            KILL_MISS: begin
                if (mem_rtrn_vld_i && mem_rtrn_i.rtype == ICACHE_IFILL_ACK) begin
                    state_d = IDLE;
                end
            end
            default: begin
                // we should never get here
                state_d = FLUSH;
            end
        endcase // state_q
    end

///////////////////////////////////////////////////////
// valid bit invalidation and replacement strategy
///////////////////////////////////////////////////////

    // note: it cannot happen that we get an invalidation + a cl replacement
    // in the same cycle as these requests arrive via the same interface
    // flushes take precedence over invalidations (it is ok if we ignore
    // the inval since the cache is cleared anyway)

    assign flush_cnt_d = (flush_done) ? '0 :
                         (flush_en)   ? flush_cnt_q + 1 :
                                        flush_cnt_q;
    assign flush_done = (flush_cnt_q==(ICACHE_NUM_WORDS-1));

    // invalidation/clearing address
    // flushing takes precedence over invals
    assign vld_addr = (flush_en) ? flush_cnt_q :
                      (inv_en)   ? mem_rtrn_i.inv.idx[ICACHE_INDEX_WIDTH-1:ICACHE_OFFSET_WIDTH] :
                                   cl_index;

    // per-way enable for the valid bits: all ways on flush/lookup/all-inval,
    // the indicated way on a single-line inval, the replacement way otherwise
    assign vld_req = (flush_en | cache_rden)        ? '1 :
                     (mem_rtrn_i.inv.all & inv_en)  ? '1 :
                     (mem_rtrn_i.inv.vld & inv_en)  ? icache_way_bin2oh(mem_rtrn_i.inv.way) :
                                                      repl_way_oh_q;

    assign vld_wdata = (cache_wren) ? '1 : '0;

    assign vld_we = (cache_wren | inv_en | flush_en);
    // assign vld_req = (vld_we | cache_rden);

    // choose random replacement if all are valid
    assign update_lfsr = cache_wren & all_ways_valid;
    assign repl_way = (all_ways_valid) ? rnd_way : inv_way;
    assign repl_way_oh_d = (cmp_en_q) ? icache_way_bin2oh(repl_way) : repl_way_oh_q;

    // enable signals for memory arrays
    assign cl_req = (cache_rden) ? '1 :
                    (cache_wren) ? repl_way_oh_q :
                                   '0;
    assign cl_we = cache_wren;

    // find invalid cache line
    lzc #(
        .WIDTH ( ICACHE_SET_ASSOC )
    ) i_lzc (
        .in_i    ( ~vld_rdata     ),
        .cnt_o   ( inv_way        ),
        .empty_o ( all_ways_valid )
    );

    // generate random cacheline index
    lfsr_8bit #(
        .WIDTH (ICACHE_SET_ASSOC)
    ) i_lfsr (
        .clk_i          ( clk_i       ),
        .rst_ni         ( rst_ni      ),
        .en_i           ( update_lfsr ),
        .refill_way_oh  (             ),
        .refill_way_bin ( rnd_way     )
    );

///////////////////////////////////////////////////////
// tag comparison, hit generation
///////////////////////////////////////////////////////

    logic [$clog2(ICACHE_SET_ASSOC)-1:0] hit_idx;

    generate
        for (genvar i=0;i<ICACHE_SET_ASSOC;i++) begin : g_tag_cmpsel
            assign cl_hit[i] = (cl_tag_rdata[i] == cl_tag_d) & vld_rdata[i];
            assign cl_sel[i] = cl_rdata[i][{cl_offset_q,3'b0} +: FETCH_WIDTH];
        end
    endgenerate

    lzc #(
        .WIDTH ( ICACHE_SET_ASSOC )
    ) i_lzc_hit (
        .in_i    ( cl_hit  ),
        .cnt_o   ( hit_idx ),
        .empty_o (         )
    );

    // on a lookup (cmp_en_q) return the word from the hit way,
    // otherwise forward the word directly from the ifill response
    assign dreq_o.data = ( cmp_en_q ) ? cl_sel[hit_idx] :
                                        mem_rtrn_i.data[{cl_offset_q,3'b0} +: FETCH_WIDTH];

///////////////////////////////////////////////////////
// memory arrays and regs
///////////////////////////////////////////////////////

    logic [ICACHE_TAG_WIDTH:0] cl_tag_valid_rdata [ICACHE_SET_ASSOC-1:0];

    generate
        for (genvar i = 0; i < ICACHE_SET_ASSOC; i++) begin : g_sram
            // Tag RAM
            sram #(
                // tag + valid bit
                .DATA_WIDTH ( ICACHE_TAG_WIDTH+1 ),
                .NUM_WORDS  ( ICACHE_NUM_WORDS   )
            ) tag_sram (
                .clk_i   ( clk_i                  ),
                .rst_ni  ( rst_ni                 ),
                .req_i   ( vld_req[i]             ),
                .we_i    ( vld_we                 ),
                .addr_i  ( vld_addr               ),
                // we can always use the saved tag here since it takes a
                // couple of cycles until we write to the cache upon a miss
                .wdata_i ( {vld_wdata[i], cl_tag_q} ),
                .be_i    ( '1                     ),
                .rdata_o ( cl_tag_valid_rdata[i]  )
            );
            assign cl_tag_rdata[i] = cl_tag_valid_rdata[i][ICACHE_TAG_WIDTH-1:0];
            assign vld_rdata[i]    = cl_tag_valid_rdata[i][ICACHE_TAG_WIDTH];

            // Data RAM
            sram #(
                .DATA_WIDTH ( ICACHE_LINE_WIDTH ),
                .NUM_WORDS  ( ICACHE_NUM_WORDS  )
            ) data_sram (
                .clk_i   ( clk_i            ),
                .rst_ni  ( rst_ni           ),
                .req_i   ( cl_req[i]        ),
                .we_i    ( cl_we            ),
                .addr_i  ( cl_index         ),
                .wdata_i ( mem_rtrn_i.data  ),
                .be_i    ( '1               ),
                .rdata_o ( cl_rdata[i]      )
            );
        end
    endgenerate

    always_ff @(posedge clk_i or negedge rst_ni) begin : p_regs
        if(~rst_ni) begin
            cl_tag_q      <= '0;
            flush_cnt_q   <= '0;
            vaddr_q       <= '0;
            cmp_en_q      <= '0;
            cache_en_q    <= '0;
            flush_q       <= '0;
            state_q       <= IDLE;
            cl_offset_q   <= '0;
            repl_way_oh_q <= '0;
        end else begin
            cl_tag_q      <= cl_tag_d;
            flush_cnt_q   <= flush_cnt_d;
            vaddr_q       <= vaddr_d;
            cmp_en_q      <= cmp_en_d;
            cache_en_q    <= cache_en_d;
            flush_q       <= flush_d;
            state_q       <= state_d;
            cl_offset_q   <= cl_offset_d;
            repl_way_oh_q <= repl_way_oh_d;
        end
    end

///////////////////////////////////////////////////////
// assertions
///////////////////////////////////////////////////////

//pragma translate_off
`ifndef VERILATOR

    noncacheable0: assert property (
        @(posedge clk_i) disable iff (~rst_ni) paddr_is_nc |-> mem_rtrn_vld_i |-> state_q != KILL_MISS |-> mem_rtrn_i.rtype == ICACHE_IFILL_ACK |-> mem_rtrn_i.nc)
            else $fatal(1,"[l1 icache] NC paddr implies nc ifill");

    noncacheable1: assert property (
        @(posedge clk_i) disable iff (~rst_ni) mem_rtrn_vld_i |-> state_q != KILL_MISS |-> mem_rtrn_i.f4b |-> mem_rtrn_i.nc)
            else $fatal(1,"[l1 icache] 4b ifill implies NC");

    repl_inval0: assert property (
        @(posedge clk_i) disable iff (~rst_ni) cache_wren |-> ~(mem_rtrn_i.inv.all | mem_rtrn_i.inv.vld))
            else $fatal(1,"[l1 icache] cannot replace cacheline and invalidate cacheline simultaneously");

    repl_inval1: assert property (
        @(posedge clk_i) disable iff (~rst_ni) (mem_rtrn_i.inv.all | mem_rtrn_i.inv.vld) |-> ~cache_wren)
            else $fatal(1,"[l1 icache] cannot replace cacheline and invalidate cacheline simultaneously");

    invalid_state: assert property (
        @(posedge clk_i) disable iff (~rst_ni) (state_q inside {FLUSH, IDLE, READ, MISS, TLB_MISS, KILL_ATRANS, KILL_MISS}))
            else $fatal(1,"[l1 icache] fsm reached an invalid state");

    hot1: assert property (
        @(posedge clk_i) disable iff (~rst_ni) (~inv_en) |=> cmp_en_q |-> $onehot0(cl_hit))
            else $fatal(1,"[l1 icache] cl_hit signal must be hot1");

    initial begin
        // assert wrong parameterizations
        assert (ICACHE_INDEX_WIDTH<=12)
            else $fatal(1,"[l1 icache] cache index width can be maximum 12bit since VM uses 4kB pages");
    end
`endif
//pragma translate_on

endmodule // serpent_icache