// SPDX-License-Identifier: MIT
/*
 * Copyright © 2020 Intel Corporation
 */

#include "gen6_engine_cs.h"
#include "intel_engine.h"
#include "intel_engine_regs.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
#include "intel_gt_irq.h"
#include "intel_gt_pm_irq.h"
#include "intel_ring.h"

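/*
 * Byte offset of the scratch dword inside the per-engine HWSP; MI_FLUSH_DW
 * with STORE_INDEX directs its post-sync write here (see mi_flush_dw() below).
 */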
#define HWS_SCRATCH_ADDR	(I915_GEM_HWS_SCRATCH * sizeof(u32))

/*
 * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
 * implementing two workarounds on gen6.  From section 1.4.7.1
 * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
 *
 * [DevSNB-C+{W/A}] Before any depth stall flush (including those
 * produced by non-pipelined state commands), software needs to first
 * send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
 * 0.
 *
 * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
 * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
 *
 * And the workaround for these two requires this workaround first:
 *
 * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
 * BEFORE the pipe-control with a post-sync op and no write-cache
 * flushes.
 *
 * And this last workaround is tricky because of the requirements on
 * that bit.  From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
 * volume 2 part 1:
 *
 *     "1 of the following must also be set:
 *      - Render Target Cache Flush Enable ([12] of DW1)
 *      - Depth Cache Flush Enable ([0] of DW1)
 *      - Stall at Pixel Scoreboard ([1] of DW1)
 *      - Depth Stall ([13] of DW1)
 *      - Post-Sync Operation ([13] of DW1)
 *      - Notify Enable ([8] of DW1)"
 *
 * The cache flushes require the workaround flush that triggered this
 * one, so we can't use it.  Depth stall would trigger the same.
 * Post-sync nonzero is what triggered this second workaround, so we
 * can't use that one either.  Notify enable is IRQs, which aren't
 * really our business.  That leaves only stall at scoreboard.
 */
static int
gen6_emit_post_sync_nonzero_flush(struct i915_request *rq)
{
	u32 scratch_addr =
		intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
	u32 *cs;

	cs = intel_ring_begin(rq, 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

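	/*
	 * Step 1 of the w/a chain above: a PIPE_CONTROL that does nothing
	 * but CS-stall with "stall at pixel scoreboard", the only companion
	 * bit we can use here without tripping another workaround.
	 */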
	*cs++ = GFX_OP_PIPE_CONTROL(5);
	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
	*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = 0; /* low dword */
	*cs++ = 0; /* high dword */
	*cs++ = MI_NOOP;
	intel_ring_advance(rq, cs);

	cs = intel_ring_begin(rq, 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

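	/*
	 * Step 2: the post-sync non-zero operation itself, a QW write to the
	 * GT scratch page, satisfying both SNB w/a's quoted above.
	 */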
	*cs++ = GFX_OP_PIPE_CONTROL(5);
	*cs++ = PIPE_CONTROL_QW_WRITE;
	*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = MI_NOOP;
	intel_ring_advance(rq, cs);

	return 0;
}

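/*
 * Full gen6 render flush: emit the post-sync w/a first, then a single
 * PIPE_CONTROL carrying the requested flush/invalidate bits.
 */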
int gen6_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
	u32 scratch_addr =
		intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
	u32 *cs, flags = 0;
	int ret;

	/* Force SNB workarounds for PIPE_CONTROL flushes */
	ret = gen6_emit_post_sync_nonzero_flush(rq);
	if (ret)
		return ret;

	/*
	 * Just flush everything.  Experiments have shown that reducing the
	 * number of bits based on the write domains has little performance
	 * impact.  And when rearranging requests, the order of flushes is
	 * unknown.
	 */
	if (mode & EMIT_FLUSH) {
		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		/*
		 * Ensure that any following seqno writes only happen
		 * when the render cache is indeed flushed.
		 */
		flags |= PIPE_CONTROL_CS_STALL;
	}
	if (mode & EMIT_INVALIDATE) {
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
		/*
		 * TLB invalidate requires a post-sync write.
		 */
		flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL;
	}

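	/*
	 * Single 4-dword PIPE_CONTROL: header, flags, post-sync address in
	 * the global GTT (the scratch page), and a zero write value.
	 */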
	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = flags;
	*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = 0;
	intel_ring_advance(rq, cs);

	return 0;
}

u32 *gen6_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
	/* First we do the gen6_emit_post_sync_nonzero_flush w/a */
	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
	*cs++ = 0;
	*cs++ = 0;

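	/* Then the post-sync non-zero QW write, into the default scratch slot */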
	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = PIPE_CONTROL_QW_WRITE;
	*cs++ = intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_DEFAULT) |
		PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = 0;

	/* Finally we can flush and with it emit the breadcrumb */
	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
		 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
		 PIPE_CONTROL_DC_FLUSH_ENABLE |
		 PIPE_CONTROL_QW_WRITE |
		 PIPE_CONTROL_CS_STALL);
	*cs++ = i915_request_active_seqno(rq) |
		PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = rq->fence.seqno;

	*cs++ = MI_USER_INTERRUPT;
	*cs++ = MI_NOOP;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}

static int mi_flush_dw(struct i915_request *rq, u32 flags)
{
	u32 cmd, *cs;

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	cmd = MI_FLUSH_DW;

	/*
	 * We always require a command barrier so that subsequent
	 * commands, such as breadcrumb interrupts, are strictly ordered
	 * wrt the contents of the write cache being flushed to memory
	 * (and thus being coherent from the CPU).
	 */
	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;

	/*
	 * Bspec vol 1c.3 - blitter engine command streamer:
	 * "If ENABLED, all TLBs will be invalidated once the flush
	 * operation is complete. This bit is only valid when the
	 * Post-Sync Operation field is a value of 1h or 3h."
	 */
	cmd |= flags;

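	/*
	 * 4-dword MI_FLUSH_DW: command, post-sync address (the HWSP scratch
	 * slot, addressed via the global GTT), write value, and a NOOP pad.
	 */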
	*cs++ = cmd;
	*cs++ = HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
	*cs++ = 0;
	*cs++ = MI_NOOP;

	intel_ring_advance(rq, cs);

	return 0;
}

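/* Pass the invalidation flags to MI_FLUSH_DW only when EMIT_INVALIDATE is set */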
static int gen6_flush_dw(struct i915_request *rq, u32 mode, u32 invflags)
{
	return mi_flush_dw(rq, mode & EMIT_INVALIDATE ? invflags : 0);
}

int gen6_emit_flush_xcs(struct i915_request *rq, u32 mode)
{
	return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB);
}

int gen6_emit_flush_vcs(struct i915_request *rq, u32 mode)
{
	return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB | MI_INVALIDATE_BSD);
}

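/*
 * Batch buffer dispatch: batches are marked non-secure (unprivileged) by
 * default; a secure dispatch clears that bit so the batch runs privileged.
 */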
int gen6_emit_bb_start(struct i915_request *rq,
		       u64 offset, u32 len,
		       unsigned int dispatch_flags)
{
	u32 security;
	u32 *cs;

	security = MI_BATCH_NON_SECURE_I965;
	if (dispatch_flags & I915_DISPATCH_SECURE)
		security = 0;

	cs = intel_ring_begin(rq, 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	cs = __gen6_emit_bb_start(cs, offset, security);
	intel_ring_advance(rq, cs);

	return 0;
}

int
hsw_emit_bb_start(struct i915_request *rq,
		  u64 offset, u32 len,
		  unsigned int dispatch_flags)
{
	u32 security;
	u32 *cs;

	security = MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW;
	if (dispatch_flags & I915_DISPATCH_SECURE)
		security = 0;

	cs = intel_ring_begin(rq, 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	cs = __gen6_emit_bb_start(cs, offset, security);
	intel_ring_advance(rq, cs);

	return 0;
}

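/*
 * Emit a CS-stall-only PIPE_CONTROL (stall at pixel scoreboard); used as the
 * barrier that must precede a state-cache-invalidating PIPE_CONTROL on gen7.
 */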
static int gen7_stall_cs(struct i915_request *rq)
{
	u32 *cs;

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
	*cs++ = 0;
	*cs++ = 0;
	intel_ring_advance(rq, cs);

	return 0;
}

int gen7_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
	u32 scratch_addr =
		intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
	u32 *cs, flags = 0;

	/*
	 * Ensure that any following seqno writes only happen when the render
	 * cache is indeed flushed.
	 *
	 * Workaround: 4th PIPE_CONTROL command (except the ones with only
	 * read-cache invalidate bits set) must have the CS_STALL bit set. We
	 * don't try to be clever and just set it unconditionally.
	 */
	flags |= PIPE_CONTROL_CS_STALL;

	/*
	 * CS_STALL suggests at least a post-sync write.
	 */
	flags |= PIPE_CONTROL_QW_WRITE;
	flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;

	/*
	 * Just flush everything.  Experiments have shown that reducing the
	 * number of bits based on the write domains has little performance
	 * impact.
	 */
	if (mode & EMIT_FLUSH) {
		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
		flags |= PIPE_CONTROL_FLUSH_ENABLE;
	}
	if (mode & EMIT_INVALIDATE) {
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_MEDIA_STATE_CLEAR;

		/*
		 * Workaround: we must issue a pipe_control with CS-stall bit
		 * set before a pipe_control command that has the state cache
		 * invalidate bit set.
		 */
		gen7_stall_cs(rq);
	}

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = flags;
	*cs++ = scratch_addr;
	*cs++ = 0;
	intel_ring_advance(rq, cs);

	return 0;
}

u32 *gen7_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
		 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
		 PIPE_CONTROL_DC_FLUSH_ENABLE |
		 PIPE_CONTROL_FLUSH_ENABLE |
		 PIPE_CONTROL_QW_WRITE |
		 PIPE_CONTROL_GLOBAL_GTT_IVB |
		 PIPE_CONTROL_CS_STALL);
	*cs++ = i915_request_active_seqno(rq);
	*cs++ = rq->fence.seqno;

	*cs++ = MI_USER_INTERRUPT;
	*cs++ = MI_NOOP;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}

u32 *gen6_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
	GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
	GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR);

	*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
	*cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
	*cs++ = rq->fence.seqno;

	*cs++ = MI_USER_INTERRUPT;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}

#define GEN7_XCS_WA 32
u32 *gen7_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
	int i;

	GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
	GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR);

	*cs++ = MI_FLUSH_DW | MI_INVALIDATE_TLB |
		MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
	*cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
	*cs++ = rq->fence.seqno;

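	/*
	 * The GEN7_XCS_WA loop below rewrites the seqno to the status page
	 * repeatedly; presumably this is to ensure the breadcrumb write has
	 * actually landed before MI_USER_INTERRUPT fires, working around
	 * write-flushing behaviour on the gen7 xcs rings.
	 */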
	for (i = 0; i < GEN7_XCS_WA; i++) {
		*cs++ = MI_STORE_DWORD_INDEX;
		*cs++ = I915_GEM_HWS_SEQNO_ADDR;
		*cs++ = rq->fence.seqno;
	}

	*cs++ = MI_FLUSH_DW;
	*cs++ = 0;
	*cs++ = 0;

	*cs++ = MI_USER_INTERRUPT;
	*cs++ = MI_NOOP;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}
#undef GEN7_XCS_WA

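/*
 * RING_IMR bits are interrupt *masks*: a set bit blocks that interrupt, so
 * enabling writes the complement of the bits we want live.  irq_keep_mask
 * covers interrupts that must stay unmasked even while the engine's user
 * interrupts are disabled.
 */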
void gen6_irq_enable(struct intel_engine_cs *engine)
{
	ENGINE_WRITE(engine, RING_IMR,
		     ~(engine->irq_enable_mask | engine->irq_keep_mask));

	/* Flush/delay to ensure the RING_IMR is active before the GT IMR */
	ENGINE_POSTING_READ(engine, RING_IMR);

	gen5_gt_enable_irq(engine->gt, engine->irq_enable_mask);
}

void gen6_irq_disable(struct intel_engine_cs *engine)
{
	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
	gen5_gt_disable_irq(engine->gt, engine->irq_enable_mask);
}

void hsw_irq_enable_vecs(struct intel_engine_cs *engine)
{
	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_enable_mask);

	/* Flush/delay to ensure the RING_IMR is active before the GT IMR */
	ENGINE_POSTING_READ(engine, RING_IMR);

	gen6_gt_pm_unmask_irq(engine->gt, engine->irq_enable_mask);
}

void hsw_irq_disable_vecs(struct intel_engine_cs *engine)
{
	ENGINE_WRITE(engine, RING_IMR, ~0);
	gen6_gt_pm_mask_irq(engine->gt, engine->irq_enable_mask);
}