/* ============================================================================
    (C) 2007 Robert T Finch
    All rights reserved.
    rob@birdcomputer.ca

    mmu.v
        Remaps (translates) a virtual address to a real address.
    The last gigabyte of the address range is not translated.

    This source code is available for evaluation and validation purposes
    only. This copyright statement and disclaimer must remain present in
    the file.

    NO WARRANTY.
    THIS Work, IS PROVIDEDED "AS IS" WITH NO WARRANTIES OF ANY KIND, WHETHER
    EXPRESS OR IMPLIED. The user must assume the entire risk of using the
    Work.

    IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY
    INCIDENTAL, CONSEQUENTIAL, OR PUNITIVE DAMAGES WHATSOEVER RELATING TO
    THE USE OF THIS WORK, OR YOUR RELATIONSHIP WITH THE AUTHOR.

    IN ADDITION, IN NO EVENT DOES THE AUTHOR AUTHORIZE YOU TO USE THE WORK
    IN APPLICATIONS OR SYSTEMS WHERE THE WORK'S FAILURE TO PERFORM CAN
    REASONABLY BE EXPECTED TO RESULT IN A SIGNIFICANT PHYSICAL INJURY, OR IN
    LOSS OF LIFE. ANY SUCH USE BY YOU IS ENTIRELY AT YOUR OWN RISK, AND YOU
    AGREE TO HOLD THE AUTHOR AND CONTRIBUTORS HARMLESS FROM ANY CLAIMS OR
    LOSSES RELATING TO SUCH UNAUTHORIZED USE.

    Webpack 8.2 xc3s1000-4ft256
    399 slices / 811 LUTs / 77 MHz
============================================================================ */

// ----------------------------------------------------------------------------
// mmu
//
// Set-associative TLB with a bus-master page-table walker.
//
// Lookup (combinational):
//   - vadr[31:30] == 2'b11 bypasses translation: tadr = vadr, v = 1 and
//     c is set only when vadr[29:28] == 2'b00.
//   - Otherwise vadr[15:12] indexes every way; a way hits when its valid
//     bit is set and its stored tag equals vadr[31:16]. On a hit,
//     tadr[31:12] comes from the TLB and {s,c,r,x,w} come from the stored
//     flag bits [4:0]; v is the OR of r, x and w.
//   - rdy is the inverse of the internal miss flag.
//
// Walker (clocked):
//   - pta[0] == 1 : two-level walk. The directory entry is read at
//                   {pta[WID-1:12], vadr[31:22], 2'b0}; the PTE is then
//                   read at {dir_entry[WID-1:12], vadr[21:12], 2'b0}.
//   - pta[0] == 0 : single-level walk. The PTE is read at
//                   {pta[WID-1:12], vadr[21:12], 2'b0}.
//   - The PTE is written back with bit 8 set (accessed) and bit 7 set to
//     the value of wr sampled when the walk started (dirty).
//   - The entry becomes valid only if PTE bits [2:0] are not all zero.
//
// All TLB entries are invalidated on reset, on invalidate_all, and whenever
// pta differs from its value on the previous clock.
//
// The S_* parameters are the walker state encodings.
// ----------------------------------------------------------------------------
module mmu
#(
    parameter WID = 32,
    pAssociativity = 4,     // number of ways (parallel compares)
    pTLB_size = 16,
    S_WAIT_MISS = 0,
    S_RD_PTL1_ACK = 1,
    S_RD_PTL0 = 2,
    S_RD_PTL0_ACK = 3,
    S_WR_PTL0 = 4,
    S_WR_PTL0_ACK = 5
)
(
    // syscon
    input rst_i,
    input clk_i,

    // master
    output reg soc_o,           // start of cycle
    output reg cyc_o,           // bus cycle active
    output reg lock_o,          // lock the bus
    input ack_i,                // acknowledge from memory system
    output reg wr_o,            // write enable output
    output [ 3:0] byt_o,        // lane selects (always all active)
    output reg [31:0] adr_o,
    input [31:0] dat_i,         // data input from memory
    output reg [31:0] dat_o,    // data to memory

    // Translation request / control
    input invalidate,           // invalidate a specific entry
    input invalidate_all,       // causes all entries to be invalidated
    input [31:0] pta,           // page directory/table address register
    input wr,                   // cpu is performing write cycle
    input [31:0] vadr,          // virtual address to translate
    output reg [31:0] tadr,     // translated address
    output rdy,                 // address translation is ready
    output reg s,c,r,w,x,       // supervisor, cacheable, read, write and execute attributes
    output reg v                // translation is valid
);

assign byt_o = 4'b1111;         // always reading / writing all lanes

integer nn;                     // loop index for the clocked blocks only
integer way;                    // loop index for the combinational lookup only
reg [2:0] nnx;                  // way that hit; pAssociativity when nothing hit
reg [31:0] pte;                 // holding place for the PTE read from memory
reg [2:0] state;
reg [31:0] prev_pta;            // previous page table address
reg [1:0] cnt;                  // tlb replacement counter
reg [1:0] whichSet;             // which set (way) to update
reg dbit;                       // temp dirty bit (wr sampled in S_WAIT_MISS)
reg miss;
reg [pAssociativity*pTLB_size-1:0] tlb_v;   // valid bits, indexed {way, vadr[15:12]}

assign rdy = !miss;

wire [pAssociativity-1:0] tlb_d;
wire [ 4: 0] tlb_flags [pAssociativity-1:0];
wire [31:16] tlb_vadr  [pAssociativity-1:0];
wire [31:12] tlb_tadr  [pAssociativity-1:0];
wire wr_tlb = state==S_WR_PTL0;

// One set of RAMs per way. The tag, translated address and flag RAMs are
// written during S_WR_PTL0 for the way selected by whichSet.
//
// The PTE fields are taken from the latched copy in pte rather than from
// dat_i: the PTE was returned together with ack_i one cycle earlier (in
// S_RD_PTL0_ACK), and no bus cycle is active during S_WR_PTL0, so dat_i is
// not guaranteed to still carry the entry at that point.
genvar g;
generate
    for (g = 0; g < pAssociativity; g = g + 1)
    begin : genTLB
        arRam1rw1r #(16,pTLB_size) tlbVadr (
            .clk(clk_i), .ce(whichSet==g), .wr(wr_tlb),
            .rwa(vadr[15:12]), .ra(vadr[15:12]),
            .i(vadr[31:16]), .rwo(), .ro(tlb_vadr[g]) );

        arRam1rw1r #(20,pTLB_size) tlbTadr (
            .clk(clk_i), .ce(whichSet==g), .wr(wr_tlb),
            .rwa(vadr[15:12]), .ra(vadr[15:12]),
            .i(pte[31:12]), .rwo(), .ro(tlb_tadr[g]) );

        arRam1rw1r #( 5,pTLB_size) tlbFlag (
            .clk(clk_i), .ce(whichSet==g), .wr(wr_tlb),
            .rwa(vadr[15:12]), .ra(vadr[15:12]),
            .i(pte[ 4: 0]), .rwo(), .ro(tlb_flags[g]) );

        // Dirty bit: written 0 when the entry is (re)loaded, written 1 for
        // the hitting way when a write hits while the walker is idle.
        // NOTE(review): the dirty-update walk also passes through S_WR_PTL0,
        // which clears this bit again - confirm that is intended.
        arRam1rw1r #( 1,pTLB_size) tlbD (
            .clk(clk_i), .ce(wr_tlb?whichSet==g:nnx==g),
            .wr(wr_tlb || (state==S_WAIT_MISS && wr && !miss)),
            .rwa(vadr[15:12]), .ra(vadr[15:12]),
            .i(!wr_tlb), .rwo(), .ro(tlb_d[g]) );
    end
endgenerate

// The following reg allows detection of when the page table address changes
always @(posedge clk_i)
    if (rst_i)
        prev_pta <= 0;
    else
        prev_pta <= pta;

// This must be fast !!!
// Lookup the virtual address in the tlb
// Translate the address
// I/O and system BIOS addresses are not mapped
// Cxxx_xxxx to FFFF_FFFF not mapped (kernel segment)
//
// Purely combinational: @* is used so the block is also re-evaluated when
// the TLB RAM outputs change, and blocking assignments are used as is
// conventional for combinational logic. When several ways match, the
// highest-numbered way wins because it is assigned last.
always @*
begin
    // defaults: miss, untranslated address, all attributes set, not valid
    miss = 1;
    nnx = pAssociativity;
    s = 1;
    c = 1;
    r = 1;
    x = 1;
    w = 1;
    v = 0;
    tadr[11: 0] = vadr[11: 0];
    tadr[31:12] = vadr[31:12];
    if (&vadr[31:30]) begin
        miss = 0;
        c = vadr[29:28]==2'b00;     // C000_0000 to CFFF_FFFF is cacheable
        v = 1;
    end
    else begin
        for (way = 0; way < pAssociativity; way = way + 1)
            if (tlb_v[{way,vadr[15:12]}] && vadr[31:16]==tlb_vadr[way]) begin
                tadr[31:12] = tlb_tadr[way];
                miss = 1'b0;
                nnx = way;
                s = tlb_flags[way][4];
                c = tlb_flags[way][3];
                r = tlb_flags[way][2];
                x = tlb_flags[way][1];
                w = tlb_flags[way][0];
                v = tlb_flags[way][2]|tlb_flags[way][1]|tlb_flags[way][0];
            end
    end
end

// The following state machine loads the tlb buffer on a
// miss.
always @(posedge clk_i)
if (rst_i) begin
    soc_o <= 0;
    cyc_o <= 0;
    lock_o <= 0;
    wr_o <= 0;
    adr_o <= 0;
    state <= S_WAIT_MISS;
    dbit <= 0;
    whichSet <= 0;
    for (nn = 0; nn < pAssociativity * pTLB_size; nn = nn + 1)
        tlb_v[nn] <= 0;     // all entries are invalid on reset
end
else begin

    soc_o <= 0;             // soc_o is a one-clock pulse unless set again below

    // changing the address of the page table invalidates all entries
    if (invalidate_all || prev_pta != pta)
        for (nn = 0; nn < pAssociativity * pTLB_size; nn = nn + 1)
            tlb_v[nn] <= 0;

    // handle invalidate command
    if (invalidate)
        for (nn = 0; nn < pAssociativity; nn = nn + 1)
            if (vadr[31:16]==tlb_vadr[nn])
                tlb_v[{nn,vadr[15:12]}] <= 0;

    case (state)    // synopsys full_case parallel_case

    // Wait for a miss to occur. then initiate bus cycle
    // Output either the page directory address
    // or the page table address, depending on the
    // size of the app.
    S_WAIT_MISS:
        begin
            state <= S_WAIT_MISS;
            dbit <= wr;

            // Set page table address for lookup
            if (pta[0])
                adr_o <= {pta[WID-1:12],vadr[31:22],2'b0};
            else
                adr_o <= {pta[WID-1:12],vadr[21:12],2'b0};

            if (miss) begin
                // try and pick an empty tlb entry; fall back to the
                // replacement counter when every way is valid
                whichSet <= cnt;
                for (nn = 0; nn < pAssociativity; nn = nn + 1)
                    if (!tlb_v[{nn,vadr[15:12]}])
                        whichSet <= nn;
                soc_o <= 1;
                cyc_o <= 1;
                // NOTE(review): in single-level mode the PTE read-modify-write
                // starts here without lock_o; only the two-level path asserts
                // lock_o (in S_RD_PTL0). Confirm this is intended.
                lock_o <= 0;
                wr_o <= 0;
                if (pta[0])
                    state <= S_RD_PTL1_ACK;
                else
                    state <= S_RD_PTL0_ACK;
            end
            // If there's a write cycle, check to see if the
            // dirty bit is set. If the dirty bit hasn't been
            // set yet, then set it and write the dirty status
            // to memory.
            else if (wr && !tlb_d[nnx]) begin
                whichSet <= nnx;
                soc_o <= 1;
                cyc_o <= 1;
                lock_o <= 0;
                wr_o <= 0;
                if (pta[0])
                    state <= S_RD_PTL1_ACK;
                else
                    state <= S_RD_PTL0_ACK;
            end
        end

    // Wait for ack from system
    // Setup to access page table
    // If app uses a page directory, now address the page table
    S_RD_PTL1_ACK:
        if (ack_i) begin
            cyc_o <= 0;
            if (|dat_i[2:0]) begin  // pte valid bit
                adr_o <= {dat_i[WID-1:12],vadr[21:12],2'b0};
                state <= S_RD_PTL0;
            end
            //else
            // not a valid translation
            // OS messed up ?
            // NOTE(review): with an invalid directory entry the bus cycle is
            // dropped but the state does not change, so the walker stays in
            // this state. No fault is reported to the requester.
        end

    //---------------------------------------------------
    // This section of the state machine performs a
    // read then write of a PTE
    //---------------------------------------------------
    // Perform a read cycle of page table level 0 entry
    S_RD_PTL0:
        begin
            soc_o <= 1;
            cyc_o <= 1;
            lock_o <= 1;
            wr_o <= 0;
            state <= S_RD_PTL0_ACK;
        end
    S_RD_PTL0_ACK:
        if (ack_i) begin
            cyc_o <= 0;
            pte <= dat_i;           // latch the PTE; the TLB RAMs load from it
            state <= S_WR_PTL0;
        end

    // The tlb has been updated so the page must have been accessed
    // set the accessed bit for the page table entry
    // Also set dirty bit if a write access.
    S_WR_PTL0:
        begin
            soc_o <= 1;
            cyc_o <= 1;
            wr_o <= 1;
            dat_o <= pte|{1'b1,dbit,7'b0};  // bit 8 = accessed, bit 7 = dirty
            state <= S_WR_PTL0_ACK;
        end
    S_WR_PTL0_ACK:
        if (ack_i) begin
            cyc_o <= 0;
            lock_o <= 0;
            wr_o <= 0;
            // the entry is usable only if the PTE grants some access
            tlb_v[{whichSet,vadr[15:12]}] <= |pte[2:0];
            state <= S_WAIT_MISS;
        end

    //---------------------------------------------------
    // This state can't happen without a hardware error
    //---------------------------------------------------
    default:
        begin
            soc_o <= 0;
            cyc_o <= 0;
            lock_o <= 0;
            wr_o <= 0;
            state <= S_WAIT_MISS;
        end
    endcase
end

// This counter is used to select the tlb entry that gets
// replaced when a new entry is entered into the buffer.
// It just increments every time an entry is updated.
always @(posedge clk_i)
    if (rst_i)
        cnt <= 0;
    else if (state==S_WAIT_MISS && miss) begin
        if (cnt == pAssociativity-1)
            cnt <= 0;
        else
            cnt <= cnt + 1;
    end

endmodule