serpent_dcache_wbuffer.sv 21.9 KB
Newer Older
sakundu committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550
// Copyright 2018 ETH Zurich and University of Bologna.
// Copyright and related rights are licensed under the Solderpad Hardware
// License, Version 0.51 (the "License"); you may not use this file except in
// compliance with the License.  You may obtain a copy of the License at
// http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
// or agreed to in writing, software, hardware and materials distributed under
// this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
//
// Author: Michael Schaffner <schaffner@iis.ee.ethz.ch>, ETH Zurich
// Date: 13.09.2018
// Description: coalescing write buffer for serpent dcache
//
// A couple of notes:
//
// 1) the write buffer behaves as a fully-associative cache, and is therefore coalescing.
//    this cache is used by the cache readout logic to forward data to the load unit.
//
//    each byte can be in the following states (valid/dirty/txblock):
//
//    0/0/0:    invalid -> free entry in the buffer
//    1/1/0:    valid and dirty, Byte is hence not part of TX in-flight
//    1/0/1:    valid and not dirty, Byte is part of a TX in-flight
//    1/1/1:    valid and, part of tx and dirty. this means that the byte has been
//              overwritten while in TX and needs to be retransmitted once the write of that byte returns.
//    1/0/0:    this would represent a clean state, but is never reached in the wbuffer in the current implementation.
//              this is because when a TX returns, and the byte is in state [1/0/1], it is written to cache if needed and
//              its state is immediately cleared to 0/x/x.
//
//    this state is used to distinguish between bytes that have been written and not
//    yet sent to the memory subsystem, and bytes that are part of a transaction.
//
// 2) further, each word in the write buffer has a cache states (checked, hit_oh)
//
//    checked == 0: unknown cache state
//    checked == 1: cache state has been looked up, valid way is stored in "hit_oh"
//
//    cache invalidations/refills affecting a particular word will clear its word state to 0,
//    so another lookup has to be done. note that these lookups are triggered as soon as there is
//    a valid word with checked == 0 in the write buffer.
//
// 3) returning write ACKs trigger a cache update if the word is present in the cache, and evict that
//    word from the write buffer. if the word is not allocated to the cache, it is just evicted from the write buffer.
//    if the word cache state is VOID, the pipeline is stalled until it is clear whether that word is in the cache or not.
//
// 4) we handle NC writes using the writebuffer circuitry. upon an NC request, the writebuffer will first be drained.
//    then, only the NC word is written into the write buffer and no further write requests are acknowledged until that
//    word has been evicted from the write buffer.

import ariane_pkg::*;
import serpent_cache_pkg::*;

module serpent_dcache_wbuffer #(
    parameter logic [63:0] CachedAddrBeg = 64'h00_8000_0000, // begin of cached region
    parameter logic [63:0] CachedAddrEnd = 64'h80_0000_0000  // end of cached region
) (
    input  logic                               clk_i,          // Clock
    input  logic                               rst_ni,         // Asynchronous reset active low

    input  logic                               cache_en_i,     // writes are treated as NC if disabled
    output logic                               empty_o,        // asserted if no data is present in write buffer
     // core request ports
    input  dcache_req_i_t                      req_port_i,
    output dcache_req_o_t                      req_port_o,
    // interface to miss handler
    input  logic                               miss_ack_i,
    output logic [63:0]                        miss_paddr_o,
    output logic                               miss_req_o,
    output logic                               miss_we_o,       // always 1 here
    output logic [63:0]                        miss_wdata_o,
    output logic [DCACHE_SET_ASSOC-1:0]        miss_vld_bits_o, // unused here (set to 0)
    output logic                               miss_nc_o,       // request to I/O space
    output logic [2:0]                         miss_size_o,     //
    output logic [DCACHE_ID_WIDTH-1:0]         miss_id_o,       // ID of this transaction (wbuffer uses all IDs from 0 to DCACHE_MAX_TX-1)
    // write responses from memory
    input  logic                               miss_rtrn_vld_i,
    input  logic [DCACHE_ID_WIDTH-1:0]         miss_rtrn_id_i,  // transaction ID to clear
    // cache read interface
    output logic [DCACHE_TAG_WIDTH-1:0]        rd_tag_o,        // tag in - comes one cycle later
    output logic [DCACHE_CL_IDX_WIDTH-1:0]     rd_idx_o,
    output logic [DCACHE_OFFSET_WIDTH-1:0]     rd_off_o,
    output logic                               rd_req_o,        // read the word at offset off_i[:3] in all ways
    output logic                               rd_tag_only_o,   // set to 1 here as we do not have to read the data arrays
    input  logic                               rd_ack_i,
    input logic  [63:0]                        rd_data_i,       // unused
    input logic  [DCACHE_SET_ASSOC-1:0]        rd_vld_bits_i,   // unused
    input logic  [DCACHE_SET_ASSOC-1:0]        rd_hit_oh_i,
    // cacheline writes
    input logic                                wr_cl_vld_i,
    input logic [DCACHE_CL_IDX_WIDTH-1:0]      wr_cl_idx_i,
    // cache word write interface
    output logic [DCACHE_SET_ASSOC-1:0]        wr_req_o,
    input  logic                               wr_ack_i,
    output logic [DCACHE_CL_IDX_WIDTH-1:0]     wr_idx_o,
    output logic [DCACHE_OFFSET_WIDTH-1:0]     wr_off_o,
    output logic [63:0]                        wr_data_o,
    output logic [7:0]                         wr_data_be_o,
    // to forwarding logic and miss unit
    output wbuffer_t  [DCACHE_WBUF_DEPTH-1:0]  wbuffer_data_o,
    output logic [DCACHE_MAX_TX-1:0][63:0]     tx_paddr_o,      // used to check for address collisions with read operations
    output logic [DCACHE_MAX_TX-1:0]           tx_vld_o
);

tx_stat_t [DCACHE_MAX_TX-1:0]             tx_stat_d, tx_stat_q;
wbuffer_t [DCACHE_WBUF_DEPTH-1:0]         wbuffer_d, wbuffer_q;
logic     [DCACHE_WBUF_DEPTH-1:0]         valid;
logic     [DCACHE_WBUF_DEPTH-1:0]         dirty;
logic     [DCACHE_WBUF_DEPTH-1:0]         tocheck;
logic     [DCACHE_WBUF_DEPTH-1:0]         wbuffer_hit_oh, inval_hit;
logic     [DCACHE_WBUF_DEPTH-1:0][7:0]    bdirty;

logic [$clog2(DCACHE_WBUF_DEPTH)-1:0] next_ptr, dirty_ptr, hit_ptr, wr_ptr, check_ptr_d, check_ptr_q, check_ptr_q1, rtrn_ptr;
logic [DCACHE_ID_WIDTH-1:0] tx_id, rtrn_id;

logic [2:0] bdirty_off;
logic [7:0] tx_be;
logic [63:0] wr_paddr, rd_paddr;
logic [DCACHE_TAG_WIDTH-1:0] rd_tag_d, rd_tag_q;
logic [DCACHE_SET_ASSOC-1:0] rd_hit_oh_d, rd_hit_oh_q;
logic check_en_d, check_en_q, check_en_q1;
logic full, dirty_rd_en, rdy;
logic rtrn_empty, evict;
logic nc_pending_d, nc_pending_q, addr_is_nc;
logic wbuffer_wren;
logic free_tx_slots;

logic wr_cl_vld_q, wr_cl_vld_d;
logic [DCACHE_CL_IDX_WIDTH-1:0] wr_cl_idx_q, wr_cl_idx_d;

logic [63:0] debug_paddr [DCACHE_WBUF_DEPTH-1:0];

///////////////////////////////////////////////////////
// misc
///////////////////////////////////////////////////////

assign miss_nc_o = nc_pending_q;

assign addr_is_nc = (req_port_i.address_tag <  (CachedAddrBeg>>DCACHE_INDEX_WIDTH)) ||
                    (req_port_i.address_tag >= (CachedAddrEnd>>DCACHE_INDEX_WIDTH)) ||
                    (!cache_en_i);

assign miss_we_o       = 1'b1;
assign miss_vld_bits_o = '0;
assign wbuffer_data_o  = wbuffer_q;

generate
    for(genvar k=0; k<DCACHE_MAX_TX;k++) begin
        assign tx_vld_o[k]   = tx_stat_q[k].vld;
        assign tx_paddr_o[k] = wbuffer_q[tx_stat_q[k].ptr].wtag<<3;
    end
endgenerate

///////////////////////////////////////////////////////
// openpiton does not understand byte enable sigs
// need to convert to the four cases:
// 00: byte
// 01: halfword
// 10: word
// 11: dword
// non-contiguous writes need to be serialized!
// e.g. merged dwords with BE like this: 8'b01001100
///////////////////////////////////////////////////////

// get byte offset
lzc #(
    .WIDTH ( 8 )
) i_vld_bdirty (
    .in_i    ( bdirty[dirty_ptr] ),
    .cnt_o   ( bdirty_off        ),
    .empty_o (                   )
);

// add the offset to the physical base address of this buffer entry
assign miss_paddr_o = {wbuffer_q[dirty_ptr].wtag, bdirty_off};
assign miss_id_o    = tx_id;

// is there any dirty word to be transmitted, and is there a free TX slot?
assign miss_req_o = (|dirty) && free_tx_slots;

// get size of aligned words, and the corresponding byte enables
// note: openpiton can only handle aligned offsets + size, and hence
// we have to split unaligned data into multiple transfers (see toSize64)
// e.g. if we have the following valid bytes: 0011_1001 -> TX0: 0000_0001, TX1: 0000_1000, TX2: 0011_0000
assign miss_size_o  = toSize64(bdirty[dirty_ptr]);

// replicate transfers shorter than a dword
assign miss_wdata_o = repData64(wbuffer_q[dirty_ptr].data,
                                bdirty_off,
                                miss_size_o[1:0]);

assign tx_be        = toByteEnable8(bdirty_off,
                                    miss_size_o[1:0]);

///////////////////////////////////////////////////////
// TX status registers and ID counters
///////////////////////////////////////////////////////

// TODO: todo: make this fall through if timing permits it
fifo_v2 #(
    .FALL_THROUGH ( 1'b0                  ),
    .DATA_WIDTH   ( $clog2(DCACHE_MAX_TX) ),
    .DEPTH        ( DCACHE_MAX_TX         )
) i_rtrn_id_fifo (
    .clk_i      ( clk_i            ),
    .rst_ni     ( rst_ni           ),
    .flush_i    ( 1'b0             ),
    .testmode_i ( 1'b0             ),
    .full_o     (                  ),
    .empty_o    ( rtrn_empty       ),
    .alm_full_o (                  ),
    .alm_empty_o(                  ),
    .data_i     ( miss_rtrn_id_i   ),
    .push_i     ( miss_rtrn_vld_i  ),
    .data_o     ( rtrn_id          ),
    .pop_i      ( evict            )
);

always_comb begin : p_tx_stat
    tx_stat_d = tx_stat_q;
    evict     = 1'b0;
    wr_req_o  = '0;

    // clear entry if it is clear whether it can be pushed to the cache or not
    if((~rtrn_empty) && wbuffer_q[rtrn_ptr].checked) begin
        // check if data is clean and can be written, otherwise skip
        // check if CL is present, otherwise skip
        if((|wr_data_be_o) && (|wbuffer_q[rtrn_ptr].hit_oh)) begin
            wr_req_o = wbuffer_q[rtrn_ptr].hit_oh;
            if(wr_ack_i) begin
                evict    = 1'b1;
                tx_stat_d[rtrn_id].vld = 1'b0;
            end
        end else begin
            evict = 1'b1;
            tx_stat_d[rtrn_id].vld = 1'b0;
        end
    end

    // allocate a new entry
    if(dirty_rd_en) begin
        tx_stat_d[tx_id].vld = 1'b1;
        tx_stat_d[tx_id].ptr = dirty_ptr;
        tx_stat_d[tx_id].be  = tx_be;
    end
end

assign free_tx_slots = |(~tx_vld_o);

// get free TX slot
rrarbiter #(
    .NUM_REQ ( DCACHE_MAX_TX ),
    .LOCK_IN ( 1             )// lock the decision, once request is asserted
) i_tx_id_rr (
    .clk_i   ( clk_i         ),
    .rst_ni  ( rst_ni        ),
    .flush_i ( 1'b0          ),
    .en_i    ( dirty_rd_en   ),
    .req_i   ( ~tx_vld_o     ),
    .ack_o   (               ),
    .vld_o   (               ),
    .idx_o   ( tx_id         )
);

///////////////////////////////////////////////////////
// cache readout & update
///////////////////////////////////////////////////////

assign rd_tag_d   = rd_paddr>>DCACHE_INDEX_WIDTH;

// trigger TAG readout in cache
assign rd_tag_only_o = 1'b1;
assign rd_paddr   = wbuffer_q[check_ptr_d].wtag<<3;
assign rd_req_o   = |tocheck;
assign rd_tag_o   = rd_tag_q;//delay by one cycle
assign rd_idx_o   = rd_paddr[DCACHE_INDEX_WIDTH-1:DCACHE_OFFSET_WIDTH];
assign rd_off_o   = rd_paddr[DCACHE_OFFSET_WIDTH-1:0];
assign check_en_d = rd_req_o & rd_ack_i;

// cache update port
assign rtrn_ptr     = tx_stat_q[rtrn_id].ptr;
// if we wrote into a word while it was in-flight, we cannot write the dirty bytes to the cache
// when the TX returns
assign wr_data_be_o = tx_stat_q[rtrn_id].be & (~wbuffer_q[rtrn_ptr].dirty);
assign wr_paddr     = wbuffer_q[rtrn_ptr].wtag<<3;
assign wr_idx_o     = wr_paddr[DCACHE_INDEX_WIDTH-1:DCACHE_OFFSET_WIDTH];
assign wr_off_o     = wr_paddr[DCACHE_OFFSET_WIDTH-1:0];
assign wr_data_o    = wbuffer_q[rtrn_ptr].data;


///////////////////////////////////////////////////////
// readout of status bits, index calculation
///////////////////////////////////////////////////////

assign wr_cl_vld_d = wr_cl_vld_i;
assign wr_cl_idx_d = wr_cl_idx_i;

generate
    for(genvar k=0; k<DCACHE_WBUF_DEPTH; k++) begin
        // only for debug, will be pruned
        assign debug_paddr[k] = wbuffer_q[k].wtag << 3;

        // dirty bytes that are ready for transmission.
        // note that we cannot retransmit a word that is already in-flight
        // since the multiple transactions might overtake each other in the memory system!
        assign bdirty[k] = (|wbuffer_q[k].txblock) ? '0 : wbuffer_q[k].dirty & wbuffer_q[k].valid;


        assign dirty[k]          = |bdirty[k];
        assign valid[k]          = |wbuffer_q[k].valid;
        assign wbuffer_hit_oh[k] = valid[k] & (wbuffer_q[k].wtag == {req_port_i.address_tag, req_port_i.address_index[DCACHE_INDEX_WIDTH-1:3]});

        // checks if an invalidation/cache refill hits a particular word
        // note: an invalidation can hit multiple words!
        // need to respect previous cycle, too, since we add a cycle of latency to the rd_hit_oh_i signal...
        assign inval_hit[k]  = (wr_cl_vld_d & valid[k] & (wbuffer_q[k].wtag[DCACHE_INDEX_WIDTH-1:0]<<3 == wr_cl_idx_d<<DCACHE_OFFSET_WIDTH)) |
                               (wr_cl_vld_q & valid[k] & (wbuffer_q[k].wtag[DCACHE_INDEX_WIDTH-1:0]<<3 == wr_cl_idx_q<<DCACHE_OFFSET_WIDTH));

        // these word have to be looked up in the cache
        assign tocheck[k]       = (~wbuffer_q[k].checked) & valid[k];
    end
endgenerate

assign wr_ptr     = (|wbuffer_hit_oh) ? hit_ptr : next_ptr;
assign empty_o    = ~(|valid);
assign rdy        = (|wbuffer_hit_oh) | (~full);

// next free entry in the buffer
lzc #(
    .WIDTH ( DCACHE_WBUF_DEPTH )
) i_vld_lzc (
    .in_i    ( ~valid        ),
    .cnt_o   ( next_ptr      ),
    .empty_o ( full          )
);

// get index of hit
lzc #(
    .WIDTH ( DCACHE_WBUF_DEPTH )
) i_hit_lzc (
    .in_i    ( wbuffer_hit_oh ),
    .cnt_o   ( hit_ptr        ),
    .empty_o (                )
);

// next dirty word to serve
rrarbiter #(
    .NUM_REQ ( DCACHE_WBUF_DEPTH ),
    .LOCK_IN ( 1                 )// lock the decision, once request is asserted
) i_dirty_rr (
    .clk_i   ( clk_i         ),
    .rst_ni  ( rst_ni        ),
    .flush_i ( 1'b0          ),
    .en_i    ( dirty_rd_en   ),
    .req_i   ( dirty         ),
    .ack_o   (               ),
    .vld_o   (               ),
    .idx_o   ( dirty_ptr     )
);

// next word to lookup in the cache
rrarbiter #(
    .NUM_REQ ( DCACHE_WBUF_DEPTH )
) i_clean_rr (
    .clk_i   ( clk_i         ),
    .rst_ni  ( rst_ni        ),
    .flush_i ( 1'b0          ),
    .en_i    ( check_en_d    ),
    .req_i   ( tocheck       ),
    .ack_o   (               ),
    .vld_o   (               ),
    .idx_o   ( check_ptr_d   )
);

///////////////////////////////////////////////////////
// update logic
///////////////////////////////////////////////////////

assign req_port_o.data_rvalid = '0;
assign req_port_o.data_rdata  = '0;

assign rd_hit_oh_d = rd_hit_oh_i;

// TODO: rewrite and separate into MUXES and write strobe logic
always_comb begin : p_buffer
    wbuffer_d           = wbuffer_q;
    nc_pending_d        = nc_pending_q;
    dirty_rd_en         = 1'b0;
    req_port_o.data_gnt = 1'b0;
    wbuffer_wren        = 1'b0;

    // TAG lookup returns, mark corresponding word
    if(check_en_q1) begin
        if(wbuffer_q[check_ptr_q1].valid) begin
            wbuffer_d[check_ptr_q1].checked = 1'b1;
            wbuffer_d[check_ptr_q1].hit_oh = rd_hit_oh_q;
        end
    end

    // if an invalidation or cache line refill comes in and hits on the write buffer,
    // we have to discard our knowledge of the corresponding cacheline state
    for(int k=0; k<DCACHE_WBUF_DEPTH; k++) begin
        if(inval_hit[k]) begin
            wbuffer_d[k].checked = 1'b0;
        end
    end

    // once TX write response came back, we can clear the TX block. if it was not dirty, we
    // can completely evict it - otherwise we have to leave it there for retransmission
    if(evict) begin
        for(int k=0; k<8; k++) begin
            if(tx_stat_q[rtrn_id].be[k]) begin
                wbuffer_d[rtrn_ptr].txblock[k] = 1'b0;
                if(~wbuffer_q[rtrn_ptr].dirty[k]) begin
                    wbuffer_d[rtrn_ptr].valid[k] = 1'b0;

                    // NOTE: this is not strictly needed, but makes it much
                    // easier to debug, since no invalid data remains in the buffer
                    // wbuffer_d[rtrn_ptr].data[k*8 +:8] = '0;
                end
            end
        end
        // if all bytes are evicted, clear the cache status flag
        if(wbuffer_d[rtrn_ptr].valid == 0) begin
            wbuffer_d[rtrn_ptr].checked = 1'b0;
        end
    end

    // mark bytes sent out to the memory system
    if(miss_req_o & miss_ack_i) begin
        dirty_rd_en = 1'b1;
        for(int k=0; k<8; k++) begin
            if(tx_be[k]) begin
                wbuffer_d[dirty_ptr].dirty[k]   = 1'b0;
                wbuffer_d[dirty_ptr].txblock[k] = 1'b1;
            end
        end
    end

    // write new word into the buffer
    if(req_port_i.data_req & rdy) begin
        // in case we have an NC address, need to drain the buffer first
        // in case we are serving an NC address,  we block until it is written to memory
        if(empty_o | ~(addr_is_nc | nc_pending_q)) begin
            wbuffer_wren              = 1'b1;

            req_port_o.data_gnt       = 1'b1;
            nc_pending_d              = addr_is_nc;

            wbuffer_d[wr_ptr].checked = 1'b0;
            wbuffer_d[wr_ptr].wtag    = {req_port_i.address_tag, req_port_i.address_index[DCACHE_INDEX_WIDTH-1:3]};

            // mark bytes as dirty
            for(int k=0; k<8; k++) begin
                if(req_port_i.data_be[k]) begin
                    wbuffer_d[wr_ptr].valid[k]       = 1'b1;
                    wbuffer_d[wr_ptr].dirty[k]       = 1'b1;
                    wbuffer_d[wr_ptr].data[k*8 +: 8] = req_port_i.data_wdata[k*8 +: 8];
                end
            end
        end
    end
end


///////////////////////////////////////////////////////
// ff's
///////////////////////////////////////////////////////

always_ff @(posedge clk_i or negedge rst_ni) begin : p_regs
    if(~rst_ni) begin
        wbuffer_q     <= '{default: '0};
        tx_stat_q     <= '{default: '0};
        nc_pending_q  <= '0;
        check_ptr_q   <= '0;
        check_ptr_q1  <= '0;
        check_en_q    <= '0;
        check_en_q1   <= '0;
        rd_tag_q      <= '0;
        rd_hit_oh_q   <= '0;
        wr_cl_vld_q   <= '0;
        wr_cl_idx_q   <= '0;
    end else begin
        wbuffer_q     <= wbuffer_d;
        tx_stat_q     <= tx_stat_d;
        nc_pending_q  <= nc_pending_d;
        check_ptr_q   <= check_ptr_d;
        check_ptr_q1  <= check_ptr_q;
        check_en_q    <= check_en_d;
        check_en_q1   <= check_en_q;
        rd_tag_q      <= rd_tag_d;
        rd_hit_oh_q   <= rd_hit_oh_d;
        wr_cl_vld_q   <= wr_cl_vld_d;
        wr_cl_idx_q   <= wr_cl_idx_d;
    end
end


///////////////////////////////////////////////////////
// assertions
///////////////////////////////////////////////////////

//pragma translate_off
`ifndef VERILATOR

    hot1: assert property (
        @(posedge clk_i) disable iff (~rst_ni) req_port_i.data_req |-> $onehot0(wbuffer_hit_oh))
            else $fatal(1,"[l1 dcache wbuffer] wbuffer_hit_oh signal must be hot1");

    tx_status: assert property (
        @(posedge clk_i) disable iff (~rst_ni) evict & miss_ack_i & miss_req_o |-> (tx_id != rtrn_id))
            else $fatal(1,"[l1 dcache wbuffer] cannot allocate and clear same tx slot id in the same cycle");

    tx_valid0: assert property (
        @(posedge clk_i) disable iff (~rst_ni) evict |-> tx_stat_q[rtrn_id].vld)
            else $fatal(1,"[l1 dcache wbuffer] evicting invalid transaction slot");

    tx_valid1: assert property (
        @(posedge clk_i) disable iff (~rst_ni) evict |-> |wbuffer_q[rtrn_ptr].valid)
            else $fatal(1,"[l1 dcache wbuffer] wbuffer entry corresponding to this transaction is invalid");

    write_full: assert property (
        @(posedge clk_i) disable iff (~rst_ni) req_port_i.data_req |-> req_port_o.data_gnt |-> ((~full) | (|wbuffer_hit_oh)))
            else $fatal(1,"[l1 dcache wbuffer] cannot write if full or no hit");

    unused0: assert property (
        @(posedge clk_i) disable iff (~rst_ni) ~req_port_i.tag_valid)
            else $fatal(1,"[l1 dcache wbuffer] req_port_i.tag_valid should not be asserted");

    unused1: assert property (
        @(posedge clk_i) disable iff (~rst_ni) ~req_port_i.kill_req)
            else $fatal(1,"[l1 dcache wbuffer] req_port_i.kill_req should not be asserted");

    generate
        for(genvar k=0; k<DCACHE_WBUF_DEPTH; k++) begin
            for(genvar j=0; j<8; j++) begin
                byteStates: assert property (
                    @(posedge clk_i) disable iff (~rst_ni) {wbuffer_q[k].valid[j], wbuffer_q[k].dirty[j], wbuffer_q[k].txblock[j]} inside {3'b000, 3'b110, 3'b101, 3'b111} )
                        else $fatal(1,"[l1 dcache wbuffer] byte %02d of wbuffer entry %02d has invalid state: valid=%01b, dirty=%01b, txblock=%01b",
                            j,k,
                            wbuffer_q[k].valid[j],
                            wbuffer_q[k].dirty[j],
                            wbuffer_q[k].txblock[j]);
            end
        end
    endgenerate
`endif
//pragma translate_on

endmodule // serpent_dcache_wbuffer