// Copyright 2018 ETH Zurich and University of Bologna. // Copyright and related rights are licensed under the Solderpad Hardware // License, Version 0.51 (the "License"); you may not use this file except in // compliance with the License. You may obtain a copy of the License at // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law // or agreed to in writing, software, hardware and materials distributed under // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. // // Author: Michael Schaffner <schaffner@iis.ee.ethz.ch>, ETH Zurich // Date: 13.09.2018 // Description: coalescing write buffer for serpent dcache // // A couple of notes: // // 1) the write buffer behaves as a fully-associative cache, and is therefore coalescing. // this cache is used by the cache readout logic to forward data to the load unit. // // each byte can be in the following states (valid/dirty/txblock): // // 0/0/0: invalid -> free entry in the buffer // 1/1/0: valid and dirty, Byte is hence not part of TX in-flight // 1/0/1: valid and not dirty, Byte is part of a TX in-flight // 1/1/1: valid and, part of tx and dirty. this means that the byte has been // overwritten while in TX and needs to be retransmitted once the write of that byte returns. // 1/0/0: this would represent a clean state, but is never reached in the wbuffer in the current implementation. // this is because when a TX returns, and the byte is in state [1/0/1], it is written to cache if needed and // its state is immediately cleared to 0/x/x. // // this state is used to distinguish between bytes that have been written and not // yet sent to the memory subsystem, and bytes that are part of a transaction. // // 2) further, each word in the write buffer has a cache states (checked, hit_oh) // // checked == 0: unknown cache state // checked == 1: cache state has been looked up, valid way is stored in "hit_oh" // // cache invalidations/refills affecting a particular word will clear its word state to 0, // so another lookup has to be done. note that these lookups are triggered as soon as there is // a valid word with checked == 0 in the write buffer. // // 3) returning write ACKs trigger a cache update if the word is present in the cache, and evict that // word from the write buffer. if the word is not allocated to the cache, it is just evicted from the write buffer. // if the word cache state is VOID, the pipeline is stalled until it is clear whether that word is in the cache or not. // // 4) we handle NC writes using the writebuffer circuitry. upon an NC request, the writebuffer will first be drained. // then, only the NC word is written into the write buffer and no further write requests are acknowledged until that // word has been evicted from the write buffer. import ariane_pkg::*; import serpent_cache_pkg::*; module serpent_dcache_wbuffer #( parameter logic [63:0] CachedAddrBeg = 64'h00_8000_0000, // begin of cached region parameter logic [63:0] CachedAddrEnd = 64'h80_0000_0000 // end of cached region ) ( input logic clk_i, // Clock input logic rst_ni, // Asynchronous reset active low input logic cache_en_i, // writes are treated as NC if disabled output logic empty_o, // asserted if no data is present in write buffer // core request ports input dcache_req_i_t req_port_i, output dcache_req_o_t req_port_o, // interface to miss handler input logic miss_ack_i, output logic [63:0] miss_paddr_o, output logic miss_req_o, output logic miss_we_o, // always 1 here output logic [63:0] miss_wdata_o, output logic [DCACHE_SET_ASSOC-1:0] miss_vld_bits_o, // unused here (set to 0) output logic miss_nc_o, // request to I/O space output logic [2:0] miss_size_o, // output logic [DCACHE_ID_WIDTH-1:0] miss_id_o, // ID of this transaction (wbuffer uses all IDs from 0 to DCACHE_MAX_TX-1) // write responses from memory input logic miss_rtrn_vld_i, input logic [DCACHE_ID_WIDTH-1:0] miss_rtrn_id_i, // transaction ID to clear // cache read interface output logic [DCACHE_TAG_WIDTH-1:0] rd_tag_o, // tag in - comes one cycle later output logic [DCACHE_CL_IDX_WIDTH-1:0] rd_idx_o, output logic [DCACHE_OFFSET_WIDTH-1:0] rd_off_o, output logic rd_req_o, // read the word at offset off_i[:3] in all ways output logic rd_tag_only_o, // set to 1 here as we do not have to read the data arrays input logic rd_ack_i, input logic [63:0] rd_data_i, // unused input logic [DCACHE_SET_ASSOC-1:0] rd_vld_bits_i, // unused input logic [DCACHE_SET_ASSOC-1:0] rd_hit_oh_i, // cacheline writes input logic wr_cl_vld_i, input logic [DCACHE_CL_IDX_WIDTH-1:0] wr_cl_idx_i, // cache word write interface output logic [DCACHE_SET_ASSOC-1:0] wr_req_o, input logic wr_ack_i, output logic [DCACHE_CL_IDX_WIDTH-1:0] wr_idx_o, output logic [DCACHE_OFFSET_WIDTH-1:0] wr_off_o, output logic [63:0] wr_data_o, output logic [7:0] wr_data_be_o, // to forwarding logic and miss unit output wbuffer_t [DCACHE_WBUF_DEPTH-1:0] wbuffer_data_o, output logic [DCACHE_MAX_TX-1:0][63:0] tx_paddr_o, // used to check for address collisions with read operations output logic [DCACHE_MAX_TX-1:0] tx_vld_o ); tx_stat_t [DCACHE_MAX_TX-1:0] tx_stat_d, tx_stat_q; wbuffer_t [DCACHE_WBUF_DEPTH-1:0] wbuffer_d, wbuffer_q; logic [DCACHE_WBUF_DEPTH-1:0] valid; logic [DCACHE_WBUF_DEPTH-1:0] dirty; logic [DCACHE_WBUF_DEPTH-1:0] tocheck; logic [DCACHE_WBUF_DEPTH-1:0] wbuffer_hit_oh, inval_hit; logic [DCACHE_WBUF_DEPTH-1:0][7:0] bdirty; logic [$clog2(DCACHE_WBUF_DEPTH)-1:0] next_ptr, dirty_ptr, hit_ptr, wr_ptr, check_ptr_d, check_ptr_q, check_ptr_q1, rtrn_ptr; logic [DCACHE_ID_WIDTH-1:0] tx_id, rtrn_id; logic [2:0] bdirty_off; logic [7:0] tx_be; logic [63:0] wr_paddr, rd_paddr; logic [DCACHE_TAG_WIDTH-1:0] rd_tag_d, rd_tag_q; logic [DCACHE_SET_ASSOC-1:0] rd_hit_oh_d, rd_hit_oh_q; logic check_en_d, check_en_q, check_en_q1; logic full, dirty_rd_en, rdy; logic rtrn_empty, evict; logic nc_pending_d, nc_pending_q, addr_is_nc; logic wbuffer_wren; logic free_tx_slots; logic wr_cl_vld_q, wr_cl_vld_d; logic [DCACHE_CL_IDX_WIDTH-1:0] wr_cl_idx_q, wr_cl_idx_d; logic [63:0] debug_paddr [DCACHE_WBUF_DEPTH-1:0]; /////////////////////////////////////////////////////// // misc /////////////////////////////////////////////////////// assign miss_nc_o = nc_pending_q; assign addr_is_nc = (req_port_i.address_tag < (CachedAddrBeg>>DCACHE_INDEX_WIDTH)) || (req_port_i.address_tag >= (CachedAddrEnd>>DCACHE_INDEX_WIDTH)) || (!cache_en_i); assign miss_we_o = 1'b1; assign miss_vld_bits_o = '0; assign wbuffer_data_o = wbuffer_q; generate for(genvar k=0; k<DCACHE_MAX_TX;k++) begin assign tx_vld_o[k] = tx_stat_q[k].vld; assign tx_paddr_o[k] = wbuffer_q[tx_stat_q[k].ptr].wtag<<3; end endgenerate /////////////////////////////////////////////////////// // openpiton does not understand byte enable sigs // need to convert to the four cases: // 00: byte // 01: halfword // 10: word // 11: dword // non-contiguous writes need to be serialized! // e.g. merged dwords with BE like this: 8'b01001100 /////////////////////////////////////////////////////// // get byte offset lzc #( .WIDTH ( 8 ) ) i_vld_bdirty ( .in_i ( bdirty[dirty_ptr] ), .cnt_o ( bdirty_off ), .empty_o ( ) ); // add the offset to the physical base address of this buffer entry assign miss_paddr_o = {wbuffer_q[dirty_ptr].wtag, bdirty_off}; assign miss_id_o = tx_id; // is there any dirty word to be transmitted, and is there a free TX slot? assign miss_req_o = (|dirty) && free_tx_slots; // get size of aligned words, and the corresponding byte enables // note: openpiton can only handle aligned offsets + size, and hence // we have to split unaligned data into multiple transfers (see toSize64) // e.g. if we have the following valid bytes: 0011_1001 -> TX0: 0000_0001, TX1: 0000_1000, TX2: 0011_0000 assign miss_size_o = toSize64(bdirty[dirty_ptr]); // replicate transfers shorter than a dword assign miss_wdata_o = repData64(wbuffer_q[dirty_ptr].data, bdirty_off, miss_size_o[1:0]); assign tx_be = toByteEnable8(bdirty_off, miss_size_o[1:0]); /////////////////////////////////////////////////////// // TX status registers and ID counters /////////////////////////////////////////////////////// // TODO: todo: make this fall through if timing permits it fifo_v2 #( .FALL_THROUGH ( 1'b0 ), .DATA_WIDTH ( $clog2(DCACHE_MAX_TX) ), .DEPTH ( DCACHE_MAX_TX ) ) i_rtrn_id_fifo ( .clk_i ( clk_i ), .rst_ni ( rst_ni ), .flush_i ( 1'b0 ), .testmode_i ( 1'b0 ), .full_o ( ), .empty_o ( rtrn_empty ), .alm_full_o ( ), .alm_empty_o( ), .data_i ( miss_rtrn_id_i ), .push_i ( miss_rtrn_vld_i ), .data_o ( rtrn_id ), .pop_i ( evict ) ); always_comb begin : p_tx_stat tx_stat_d = tx_stat_q; evict = 1'b0; wr_req_o = '0; // clear entry if it is clear whether it can be pushed to the cache or not if((~rtrn_empty) && wbuffer_q[rtrn_ptr].checked) begin // check if data is clean and can be written, otherwise skip // check if CL is present, otherwise skip if((|wr_data_be_o) && (|wbuffer_q[rtrn_ptr].hit_oh)) begin wr_req_o = wbuffer_q[rtrn_ptr].hit_oh; if(wr_ack_i) begin evict = 1'b1; tx_stat_d[rtrn_id].vld = 1'b0; end end else begin evict = 1'b1; tx_stat_d[rtrn_id].vld = 1'b0; end end // allocate a new entry if(dirty_rd_en) begin tx_stat_d[tx_id].vld = 1'b1; tx_stat_d[tx_id].ptr = dirty_ptr; tx_stat_d[tx_id].be = tx_be; end end assign free_tx_slots = |(~tx_vld_o); // get free TX slot rrarbiter #( .NUM_REQ ( DCACHE_MAX_TX ), .LOCK_IN ( 1 )// lock the decision, once request is asserted ) i_tx_id_rr ( .clk_i ( clk_i ), .rst_ni ( rst_ni ), .flush_i ( 1'b0 ), .en_i ( dirty_rd_en ), .req_i ( ~tx_vld_o ), .ack_o ( ), .vld_o ( ), .idx_o ( tx_id ) ); /////////////////////////////////////////////////////// // cache readout & update /////////////////////////////////////////////////////// assign rd_tag_d = rd_paddr>>DCACHE_INDEX_WIDTH; // trigger TAG readout in cache assign rd_tag_only_o = 1'b1; assign rd_paddr = wbuffer_q[check_ptr_d].wtag<<3; assign rd_req_o = |tocheck; assign rd_tag_o = rd_tag_q;//delay by one cycle assign rd_idx_o = rd_paddr[DCACHE_INDEX_WIDTH-1:DCACHE_OFFSET_WIDTH]; assign rd_off_o = rd_paddr[DCACHE_OFFSET_WIDTH-1:0]; assign check_en_d = rd_req_o & rd_ack_i; // cache update port assign rtrn_ptr = tx_stat_q[rtrn_id].ptr; // if we wrote into a word while it was in-flight, we cannot write the dirty bytes to the cache // when the TX returns assign wr_data_be_o = tx_stat_q[rtrn_id].be & (~wbuffer_q[rtrn_ptr].dirty); assign wr_paddr = wbuffer_q[rtrn_ptr].wtag<<3; assign wr_idx_o = wr_paddr[DCACHE_INDEX_WIDTH-1:DCACHE_OFFSET_WIDTH]; assign wr_off_o = wr_paddr[DCACHE_OFFSET_WIDTH-1:0]; assign wr_data_o = wbuffer_q[rtrn_ptr].data; /////////////////////////////////////////////////////// // readout of status bits, index calculation /////////////////////////////////////////////////////// assign wr_cl_vld_d = wr_cl_vld_i; assign wr_cl_idx_d = wr_cl_idx_i; generate for(genvar k=0; k<DCACHE_WBUF_DEPTH; k++) begin // only for debug, will be pruned assign debug_paddr[k] = wbuffer_q[k].wtag << 3; // dirty bytes that are ready for transmission. // note that we cannot retransmit a word that is already in-flight // since the multiple transactions might overtake each other in the memory system! assign bdirty[k] = (|wbuffer_q[k].txblock) ? '0 : wbuffer_q[k].dirty & wbuffer_q[k].valid; assign dirty[k] = |bdirty[k]; assign valid[k] = |wbuffer_q[k].valid; assign wbuffer_hit_oh[k] = valid[k] & (wbuffer_q[k].wtag == {req_port_i.address_tag, req_port_i.address_index[DCACHE_INDEX_WIDTH-1:3]}); // checks if an invalidation/cache refill hits a particular word // note: an invalidation can hit multiple words! // need to respect previous cycle, too, since we add a cycle of latency to the rd_hit_oh_i signal... assign inval_hit[k] = (wr_cl_vld_d & valid[k] & (wbuffer_q[k].wtag[DCACHE_INDEX_WIDTH-1:0]<<3 == wr_cl_idx_d<<DCACHE_OFFSET_WIDTH)) | (wr_cl_vld_q & valid[k] & (wbuffer_q[k].wtag[DCACHE_INDEX_WIDTH-1:0]<<3 == wr_cl_idx_q<<DCACHE_OFFSET_WIDTH)); // these word have to be looked up in the cache assign tocheck[k] = (~wbuffer_q[k].checked) & valid[k]; end endgenerate assign wr_ptr = (|wbuffer_hit_oh) ? hit_ptr : next_ptr; assign empty_o = ~(|valid); assign rdy = (|wbuffer_hit_oh) | (~full); // next free entry in the buffer lzc #( .WIDTH ( DCACHE_WBUF_DEPTH ) ) i_vld_lzc ( .in_i ( ~valid ), .cnt_o ( next_ptr ), .empty_o ( full ) ); // get index of hit lzc #( .WIDTH ( DCACHE_WBUF_DEPTH ) ) i_hit_lzc ( .in_i ( wbuffer_hit_oh ), .cnt_o ( hit_ptr ), .empty_o ( ) ); // next dirty word to serve rrarbiter #( .NUM_REQ ( DCACHE_WBUF_DEPTH ), .LOCK_IN ( 1 )// lock the decision, once request is asserted ) i_dirty_rr ( .clk_i ( clk_i ), .rst_ni ( rst_ni ), .flush_i ( 1'b0 ), .en_i ( dirty_rd_en ), .req_i ( dirty ), .ack_o ( ), .vld_o ( ), .idx_o ( dirty_ptr ) ); // next word to lookup in the cache rrarbiter #( .NUM_REQ ( DCACHE_WBUF_DEPTH ) ) i_clean_rr ( .clk_i ( clk_i ), .rst_ni ( rst_ni ), .flush_i ( 1'b0 ), .en_i ( check_en_d ), .req_i ( tocheck ), .ack_o ( ), .vld_o ( ), .idx_o ( check_ptr_d ) ); /////////////////////////////////////////////////////// // update logic /////////////////////////////////////////////////////// assign req_port_o.data_rvalid = '0; assign req_port_o.data_rdata = '0; assign rd_hit_oh_d = rd_hit_oh_i; // TODO: rewrite and separate into MUXES and write strobe logic always_comb begin : p_buffer wbuffer_d = wbuffer_q; nc_pending_d = nc_pending_q; dirty_rd_en = 1'b0; req_port_o.data_gnt = 1'b0; wbuffer_wren = 1'b0; // TAG lookup returns, mark corresponding word if(check_en_q1) begin if(wbuffer_q[check_ptr_q1].valid) begin wbuffer_d[check_ptr_q1].checked = 1'b1; wbuffer_d[check_ptr_q1].hit_oh = rd_hit_oh_q; end end // if an invalidation or cache line refill comes in and hits on the write buffer, // we have to discard our knowledge of the corresponding cacheline state for(int k=0; k<DCACHE_WBUF_DEPTH; k++) begin if(inval_hit[k]) begin wbuffer_d[k].checked = 1'b0; end end // once TX write response came back, we can clear the TX block. if it was not dirty, we // can completely evict it - otherwise we have to leave it there for retransmission if(evict) begin for(int k=0; k<8; k++) begin if(tx_stat_q[rtrn_id].be[k]) begin wbuffer_d[rtrn_ptr].txblock[k] = 1'b0; if(~wbuffer_q[rtrn_ptr].dirty[k]) begin wbuffer_d[rtrn_ptr].valid[k] = 1'b0; // NOTE: this is not strictly needed, but makes it much // easier to debug, since no invalid data remains in the buffer // wbuffer_d[rtrn_ptr].data[k*8 +:8] = '0; end end end // if all bytes are evicted, clear the cache status flag if(wbuffer_d[rtrn_ptr].valid == 0) begin wbuffer_d[rtrn_ptr].checked = 1'b0; end end // mark bytes sent out to the memory system if(miss_req_o & miss_ack_i) begin dirty_rd_en = 1'b1; for(int k=0; k<8; k++) begin if(tx_be[k]) begin wbuffer_d[dirty_ptr].dirty[k] = 1'b0; wbuffer_d[dirty_ptr].txblock[k] = 1'b1; end end end // write new word into the buffer if(req_port_i.data_req & rdy) begin // in case we have an NC address, need to drain the buffer first // in case we are serving an NC address, we block until it is written to memory if(empty_o | ~(addr_is_nc | nc_pending_q)) begin wbuffer_wren = 1'b1; req_port_o.data_gnt = 1'b1; nc_pending_d = addr_is_nc; wbuffer_d[wr_ptr].checked = 1'b0; wbuffer_d[wr_ptr].wtag = {req_port_i.address_tag, req_port_i.address_index[DCACHE_INDEX_WIDTH-1:3]}; // mark bytes as dirty for(int k=0; k<8; k++) begin if(req_port_i.data_be[k]) begin wbuffer_d[wr_ptr].valid[k] = 1'b1; wbuffer_d[wr_ptr].dirty[k] = 1'b1; wbuffer_d[wr_ptr].data[k*8 +: 8] = req_port_i.data_wdata[k*8 +: 8]; end end end end end /////////////////////////////////////////////////////// // ff's /////////////////////////////////////////////////////// always_ff @(posedge clk_i or negedge rst_ni) begin : p_regs if(~rst_ni) begin wbuffer_q <= '{default: '0}; tx_stat_q <= '{default: '0}; nc_pending_q <= '0; check_ptr_q <= '0; check_ptr_q1 <= '0; check_en_q <= '0; check_en_q1 <= '0; rd_tag_q <= '0; rd_hit_oh_q <= '0; wr_cl_vld_q <= '0; wr_cl_idx_q <= '0; end else begin wbuffer_q <= wbuffer_d; tx_stat_q <= tx_stat_d; nc_pending_q <= nc_pending_d; check_ptr_q <= check_ptr_d; check_ptr_q1 <= check_ptr_q; check_en_q <= check_en_d; check_en_q1 <= check_en_q; rd_tag_q <= rd_tag_d; rd_hit_oh_q <= rd_hit_oh_d; wr_cl_vld_q <= wr_cl_vld_d; wr_cl_idx_q <= wr_cl_idx_d; end end /////////////////////////////////////////////////////// // assertions /////////////////////////////////////////////////////// //pragma translate_off `ifndef VERILATOR hot1: assert property ( @(posedge clk_i) disable iff (~rst_ni) req_port_i.data_req |-> $onehot0(wbuffer_hit_oh)) else $fatal(1,"[l1 dcache wbuffer] wbuffer_hit_oh signal must be hot1"); tx_status: assert property ( @(posedge clk_i) disable iff (~rst_ni) evict & miss_ack_i & miss_req_o |-> (tx_id != rtrn_id)) else $fatal(1,"[l1 dcache wbuffer] cannot allocate and clear same tx slot id in the same cycle"); tx_valid0: assert property ( @(posedge clk_i) disable iff (~rst_ni) evict |-> tx_stat_q[rtrn_id].vld) else $fatal(1,"[l1 dcache wbuffer] evicting invalid transaction slot"); tx_valid1: assert property ( @(posedge clk_i) disable iff (~rst_ni) evict |-> |wbuffer_q[rtrn_ptr].valid) else $fatal(1,"[l1 dcache wbuffer] wbuffer entry corresponding to this transaction is invalid"); write_full: assert property ( @(posedge clk_i) disable iff (~rst_ni) req_port_i.data_req |-> req_port_o.data_gnt |-> ((~full) | (|wbuffer_hit_oh))) else $fatal(1,"[l1 dcache wbuffer] cannot write if full or no hit"); unused0: assert property ( @(posedge clk_i) disable iff (~rst_ni) ~req_port_i.tag_valid) else $fatal(1,"[l1 dcache wbuffer] req_port_i.tag_valid should not be asserted"); unused1: assert property ( @(posedge clk_i) disable iff (~rst_ni) ~req_port_i.kill_req) else $fatal(1,"[l1 dcache wbuffer] req_port_i.kill_req should not be asserted"); generate for(genvar k=0; k<DCACHE_WBUF_DEPTH; k++) begin for(genvar j=0; j<8; j++) begin byteStates: assert property ( @(posedge clk_i) disable iff (~rst_ni) {wbuffer_q[k].valid[j], wbuffer_q[k].dirty[j], wbuffer_q[k].txblock[j]} inside {3'b000, 3'b110, 3'b101, 3'b111} ) else $fatal(1,"[l1 dcache wbuffer] byte %02d of wbuffer entry %02d has invalid state: valid=%01b, dirty=%01b, txblock=%01b", j,k, wbuffer_q[k].valid[j], wbuffer_q[k].dirty[j], wbuffer_q[k].txblock[j]); end end endgenerate `endif //pragma translate_on endmodule // serpent_dcache_wbuffer