drivers/gpu/drm/i915/gt/gen6_engine_cs.c - linux - Git at Google

 // SPDX-License-Identifier: MIT
 /*
  * Copyright © 2020 Intel Corporation
  */

 #include "gen6_engine_cs.h"
 #include "intel_engine.h"
 #include "intel_gpu_commands.h"
 #include "intel_gt.h"
 #include "intel_gt_irq.h"
 #include "intel_gt_pm_irq.h"
 #include "intel_ring.h"

 #define HWS_SCRATCH_ADDR	(I915_GEM_HWS_SCRATCH * sizeof(u32))

 /*
  * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
  * implementing two workarounds on gen6.  From section 1.4.7.1
  * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
  *
  * [DevSNB-C+{W/A}] Before any depth stall flush (including those
  * produced by non-pipelined state commands), software needs to first
  * send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
  * 0.
  *
  * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
  * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
  *
  * And the workaround for these two requires this workaround first:
  *
  * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
  * BEFORE the pipe-control with a post-sync op and no write-cache
  * flushes.
  *
  * And this last workaround is tricky because of the requirements on
  * that bit.  From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
  * volume 2 part 1:
  *
  *     "1 of the following must also be set:
  *      - Render Target Cache Flush Enable ([12] of DW1)
  *      - Depth Cache Flush Enable ([0] of DW1)
  *      - Stall at Pixel Scoreboard ([1] of DW1)
  *      - Depth Stall ([13] of DW1)
  *      - Post-Sync Operation ([15:14] of DW1)
  *      - Notify Enable ([8] of DW1)"
  *
  * The cache flushes require the workaround flush that triggered this
  * one, so we can't use it.  Depth stall would trigger the same.
  * Post-sync nonzero is what triggered this second workaround, so we
  * can't use that one either.  Notify enable is IRQs, which aren't
  * really our business.  That leaves only stall at scoreboard.
  */
 static int
 gen6_emit_post_sync_nonzero_flush(struct i915_request *rq)
 {
 	/* Both PIPE_CONTROLs below address the same GGTT scratch slot. */
 	const u32 target =
 		intel_gt_scratch_offset(rq->engine->gt,
 					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH) |
 		PIPE_CONTROL_GLOBAL_GTT;
 	u32 *cmd;

 	/* Step 1: CS stall, with stall-at-scoreboard as its companion bit. */
 	cmd = intel_ring_begin(rq, 6);
 	if (IS_ERR(cmd))
 		return PTR_ERR(cmd);

 	cmd[0] = GFX_OP_PIPE_CONTROL(5);
 	cmd[1] = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
 	cmd[2] = target;
 	cmd[3] = 0; /* low dword */
 	cmd[4] = 0; /* high dword */
 	cmd[5] = MI_NOOP;
 	intel_ring_advance(rq, cmd + 6);

 	/* Step 2: the PIPE_CONTROL carrying only a non-zero post-sync op. */
 	cmd = intel_ring_begin(rq, 6);
 	if (IS_ERR(cmd))
 		return PTR_ERR(cmd);

 	cmd[0] = GFX_OP_PIPE_CONTROL(5);
 	cmd[1] = PIPE_CONTROL_QW_WRITE;
 	cmd[2] = target;
 	cmd[3] = 0;
 	cmd[4] = 0;
 	cmd[5] = MI_NOOP;
 	intel_ring_advance(rq, cmd + 6);

 	return 0;
 }

 /*
  * Emit a gen6 render-engine flush and/or invalidate, selected by the
  * EMIT_FLUSH and EMIT_INVALIDATE bits of @mode.
  *
  * Returns 0 on success, or the negative error code reported while
  * reserving ring space.
  */
 int gen6_emit_flush_rcs(struct i915_request *rq, u32 mode)
 {
 	const u32 scratch =
 		intel_gt_scratch_offset(rq->engine->gt,
 					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
 	u32 bits = 0;
 	u32 *cmd;
 	int err;

 	/* The SNB PIPE_CONTROL workaround sequence always goes first. */
 	err = gen6_emit_post_sync_nonzero_flush(rq);
 	if (err)
 		return err;

 	/*
 	 * No attempt is made to trim the flush to the write domains:
 	 * experiments showed little performance impact, and once requests
 	 * are rearranged the order of flushes is unknown anyway.  The CS
 	 * stall ensures any following seqno write only happens once the
 	 * render cache is indeed flushed.
 	 */
 	if (mode & EMIT_FLUSH)
 		bits |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
 			PIPE_CONTROL_DEPTH_CACHE_FLUSH |
 			PIPE_CONTROL_CS_STALL;

 	/* TLB invalidate requires a post-sync write, hence QW_WRITE. */
 	if (mode & EMIT_INVALIDATE)
 		bits |= PIPE_CONTROL_TLB_INVALIDATE |
 			PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE |
 			PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
 			PIPE_CONTROL_VF_CACHE_INVALIDATE |
 			PIPE_CONTROL_CONST_CACHE_INVALIDATE |
 			PIPE_CONTROL_STATE_CACHE_INVALIDATE |
 			PIPE_CONTROL_QW_WRITE |
 			PIPE_CONTROL_CS_STALL;

 	cmd = intel_ring_begin(rq, 4);
 	if (IS_ERR(cmd))
 		return PTR_ERR(cmd);

 	cmd[0] = GFX_OP_PIPE_CONTROL(4);
 	cmd[1] = bits;
 	cmd[2] = scratch | PIPE_CONTROL_GLOBAL_GTT;
 	cmd[3] = 0;
 	intel_ring_advance(rq, cmd + 4);

 	return 0;
 }

 /*
  * Write the gen6 render-engine breadcrumb for @rq at @cs: the two-step
  * post-sync-nonzero workaround, a flushing PIPE_CONTROL that stores the
  * fence seqno, and a user interrupt.  Updates rq->tail and returns the
  * advanced command pointer.
  */
 u32 *gen6_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
 {
 	const u32 scratch =
 		intel_gt_scratch_offset(rq->engine->gt,
 					INTEL_GT_SCRATCH_FIELD_DEFAULT);

 	/* Workaround, part 1: CS stall + stall at scoreboard. */
 	cs[0] = GFX_OP_PIPE_CONTROL(4);
 	cs[1] = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
 	cs[2] = 0;
 	cs[3] = 0;
 	cs += 4;

 	/* Workaround, part 2: post-sync write into the scratch slot. */
 	cs[0] = GFX_OP_PIPE_CONTROL(4);
 	cs[1] = PIPE_CONTROL_QW_WRITE;
 	cs[2] = scratch | PIPE_CONTROL_GLOBAL_GTT;
 	cs[3] = 0;
 	cs += 4;

 	/* Now the real flush, which also stores the breadcrumb seqno. */
 	cs[0] = GFX_OP_PIPE_CONTROL(4);
 	cs[1] = PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
 		PIPE_CONTROL_DEPTH_CACHE_FLUSH |
 		PIPE_CONTROL_DC_FLUSH_ENABLE |
 		PIPE_CONTROL_QW_WRITE |
 		PIPE_CONTROL_CS_STALL;
 	cs[2] = i915_request_active_seqno(rq) | PIPE_CONTROL_GLOBAL_GTT;
 	cs[3] = rq->fence.seqno;
 	cs += 4;

 	cs[0] = MI_USER_INTERRUPT;
 	cs[1] = MI_NOOP;
 	cs += 2;

 	rq->tail = intel_ring_offset(rq, cs);
 	assert_ring_tail_valid(rq->ring, rq->tail);

 	return cs;
 }

 /*
  * Emit an MI_FLUSH_DW with a store-dword post-sync operation into the HWS
  * scratch slot, with the caller's extra @flags OR-ed into the command.
  *
  * Returns 0 on success, or the negative error code reported while
  * reserving ring space.
  */
 static int mi_flush_dw(struct i915_request *rq, u32 flags)
 {
 	/*
 	 * The store is always requested: it acts as a command barrier so
 	 * that subsequent commands, such as breadcrumb interrupts, are
 	 * strictly ordered wrt the write cache being flushed to memory
 	 * (and thus being coherent from the CPU).
 	 *
 	 * Bspec vol 1c.3 - blitter engine command streamer:
 	 * "If ENABLED, all TLBs will be invalidated once the flush
 	 * operation is complete. This bit is only valid when the
 	 * Post-Sync Operation field is a value of 1h or 3h."
 	 */
 	const u32 header = MI_FLUSH_DW |
 			   MI_FLUSH_DW_STORE_INDEX |
 			   MI_FLUSH_DW_OP_STOREDW |
 			   flags;
 	u32 *out;

 	out = intel_ring_begin(rq, 4);
 	if (IS_ERR(out))
 		return PTR_ERR(out);

 	out[0] = header;
 	out[1] = HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
 	out[2] = 0;
 	out[3] = MI_NOOP;

 	intel_ring_advance(rq, out + 4);

 	return 0;
 }

 /*
  * Emit an MI_FLUSH_DW, adding @invflags only when @mode requests an
  * invalidate.
  */
 static int gen6_flush_dw(struct i915_request *rq, u32 mode, u32 invflags)
 {
 	u32 extra = 0;

 	if (mode & EMIT_INVALIDATE)
 		extra = invflags;

 	return mi_flush_dw(rq, extra);
 }

 /* Flush via MI_FLUSH_DW; an invalidate request adds a TLB invalidate. */
 int gen6_emit_flush_xcs(struct i915_request *rq, u32 mode)
 {
 	const u32 on_invalidate = MI_INVALIDATE_TLB;

 	return gen6_flush_dw(rq, mode, on_invalidate);
 }

 /*
  * Flush via MI_FLUSH_DW; an invalidate request adds both the TLB and the
  * BSD invalidate bits.
  */
 int gen6_emit_flush_vcs(struct i915_request *rq, u32 mode)
 {
 	const u32 on_invalidate = MI_INVALIDATE_TLB | MI_INVALIDATE_BSD;

 	return gen6_flush_dw(rq, mode, on_invalidate);
 }

 /*
  * Emit a batch-buffer start at @offset.  The batch is marked non-secure
  * unless I915_DISPATCH_SECURE is set in @dispatch_flags.  @len is not
  * used here.
  *
  * Returns 0 on success, or the negative error code reported while
  * reserving ring space.
  */
 int gen6_emit_bb_start(struct i915_request *rq,
 		       u64 offset, u32 len,
 		       unsigned int dispatch_flags)
 {
 	const u32 security = (dispatch_flags & I915_DISPATCH_SECURE) ?
 			     0 : MI_BATCH_NON_SECURE_I965;
 	u32 *out;

 	out = intel_ring_begin(rq, 2);
 	if (IS_ERR(out))
 		return PTR_ERR(out);

 	out = __gen6_emit_bb_start(out, offset, security);
 	intel_ring_advance(rq, out);

 	return 0;
 }

 /*
  * Haswell variant of the batch-buffer start: a non-secure batch carries
  * the HSW PPGTT and non-secure bits, a secure one
  * (I915_DISPATCH_SECURE) carries neither.  @len is not used here.
  *
  * Returns 0 on success, or the negative error code reported while
  * reserving ring space.
  */
 int
 hsw_emit_bb_start(struct i915_request *rq,
 		  u64 offset, u32 len,
 		  unsigned int dispatch_flags)
 {
 	const u32 security = (dispatch_flags & I915_DISPATCH_SECURE) ?
 			     0 : (MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW);
 	u32 *out;

 	out = intel_ring_begin(rq, 2);
 	if (IS_ERR(out))
 		return PTR_ERR(out);

 	out = __gen6_emit_bb_start(out, offset, security);
 	intel_ring_advance(rq, out);

 	return 0;
 }

 /*
  * Emit a PIPE_CONTROL that only stalls the command streamer (CS stall
  * plus stall at scoreboard).
  *
  * Returns 0 on success, or the negative error code reported while
  * reserving ring space.
  */
 static int gen7_stall_cs(struct i915_request *rq)
 {
 	u32 *out = intel_ring_begin(rq, 4);

 	if (IS_ERR(out))
 		return PTR_ERR(out);

 	out[0] = GFX_OP_PIPE_CONTROL(4);
 	out[1] = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
 	out[2] = 0;
 	out[3] = 0;
 	intel_ring_advance(rq, out + 4);

 	return 0;
 }

 /*
  * Emit a gen7 render-engine flush and/or invalidate, selected by the
  * EMIT_FLUSH and EMIT_INVALIDATE bits of @mode.
  *
  * Returns 0 on success, or a negative error code if ring space could not
  * be reserved for either the CS-stall workaround or the flush itself.
  */
 int gen7_emit_flush_rcs(struct i915_request *rq, u32 mode)
 {
 	u32 scratch_addr =
 		intel_gt_scratch_offset(rq->engine->gt,
 					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
 	u32 *cs, flags = 0;
 	int err;

 	/*
 	 * Ensure that any following seqno writes only happen when the render
 	 * cache is indeed flushed.
 	 *
 	 * Workaround: 4th PIPE_CONTROL command (except the ones with only
 	 * read-cache invalidate bits set) must have the CS_STALL bit set. We
 	 * don't try to be clever and just set it unconditionally.
 	 */
 	flags |= PIPE_CONTROL_CS_STALL;

 	/*
 	 * CS_STALL suggests at least a post-sync write.
 	 */
 	flags |= PIPE_CONTROL_QW_WRITE;
 	flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;

 	/*
 	 * Just flush everything.  Experiments have shown that reducing the
 	 * number of bits based on the write domains has little performance
 	 * impact.
 	 */
 	if (mode & EMIT_FLUSH) {
 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
 	}
 	if (mode & EMIT_INVALIDATE) {
 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
 		flags |= PIPE_CONTROL_MEDIA_STATE_CLEAR;

 		/*
 		 * Workaround: we must issue a pipe_control with CS-stall bit
 		 * set before a pipe_control command that has the state cache
 		 * invalidate bit set.
 		 *
 		 * If the stall cannot be emitted, bail out rather than
 		 * emitting the invalidate without its required workaround.
 		 */
 		err = gen7_stall_cs(rq);
 		if (err)
 			return err;
 	}

 	cs = intel_ring_begin(rq, 4);
 	if (IS_ERR(cs))
 		return PTR_ERR(cs);

 	*cs++ = GFX_OP_PIPE_CONTROL(4);
 	*cs++ = flags;
 	*cs++ = scratch_addr;
 	*cs++ = 0;
 	intel_ring_advance(rq, cs);

 	return 0;
 }

 /*
  * Write the gen7 render-engine breadcrumb for @rq at @cs: one flushing
  * PIPE_CONTROL that stores the fence seqno, followed by a user
  * interrupt.  Updates rq->tail and returns the advanced command pointer.
  */
 u32 *gen7_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
 {
 	const u32 flush_and_store = PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
 				    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
 				    PIPE_CONTROL_DC_FLUSH_ENABLE |
 				    PIPE_CONTROL_FLUSH_ENABLE |
 				    PIPE_CONTROL_QW_WRITE |
 				    PIPE_CONTROL_GLOBAL_GTT_IVB |
 				    PIPE_CONTROL_CS_STALL;

 	cs[0] = GFX_OP_PIPE_CONTROL(4);
 	cs[1] = flush_and_store;
 	cs[2] = i915_request_active_seqno(rq);
 	cs[3] = rq->fence.seqno;
 	cs += 4;

 	cs[0] = MI_USER_INTERRUPT;
 	cs[1] = MI_NOOP;
 	cs += 2;

 	rq->tail = intel_ring_offset(rq, cs);
 	assert_ring_tail_valid(rq->ring, rq->tail);

 	return cs;
 }

 /*
  * Write the gen6 non-render breadcrumb for @rq at @cs: an MI_FLUSH_DW
  * that stores the fence seqno at the HWS seqno index, then a user
  * interrupt.  Updates rq->tail and returns the advanced command pointer.
  */
 u32 *gen6_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
 {
 	/* The indexed store below relies on the seqno living in the engine HWSP. */
 	GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
 	GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR);

 	cs[0] = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
 	cs[1] = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
 	cs[2] = rq->fence.seqno;
 	cs[3] = MI_USER_INTERRUPT;
 	cs += 4;

 	rq->tail = intel_ring_offset(rq, cs);
 	assert_ring_tail_valid(rq->ring, rq->tail);

 	return cs;
 }

 /* Number of times the seqno store is repeated in the gen7 xcs breadcrumb. */
 #define GEN7_XCS_WA 32
 /*
  * Write the gen7 non-render breadcrumb for @rq at @cs: a TLB-invalidating
  * MI_FLUSH_DW that stores the fence seqno, GEN7_XCS_WA repeated indexed
  * stores of the same seqno, a trailing bare MI_FLUSH_DW, and a user
  * interrupt.  Updates rq->tail and returns the advanced command pointer.
  */
 u32 *gen7_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
 {
 	unsigned int remaining = GEN7_XCS_WA;

 	/* The indexed stores below rely on the seqno living in the engine HWSP. */
 	GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
 	GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR);

 	cs[0] = MI_FLUSH_DW | MI_INVALIDATE_TLB |
 		MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
 	cs[1] = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
 	cs[2] = rq->fence.seqno;
 	cs += 3;

 	while (remaining--) {
 		cs[0] = MI_STORE_DWORD_INDEX;
 		cs[1] = I915_GEM_HWS_SEQNO_ADDR;
 		cs[2] = rq->fence.seqno;
 		cs += 3;
 	}

 	cs[0] = MI_FLUSH_DW;
 	cs[1] = 0;
 	cs[2] = 0;
 	cs += 3;

 	cs[0] = MI_USER_INTERRUPT;
 	cs[1] = MI_NOOP;
 	cs += 2;

 	rq->tail = intel_ring_offset(rq, cs);
 	assert_ring_tail_valid(rq->ring, rq->tail);

 	return cs;
 }
 #undef GEN7_XCS_WA

 void gen6_irq_enable(struct intel_engine_cs *engine)
 {
 	ENGINE_WRITE(engine, RING_IMR,
 		     ~(engine->irq_enable_mask | engine->irq_keep_mask));

 	/* Flush/delay to ensure the RING_IMR is active before the GT IMR */
 	ENGINE_POSTING_READ(engine, RING_IMR);

 	gen5_gt_enable_irq(engine->gt, engine->irq_enable_mask);
 }

 void gen6_irq_disable(struct intel_engine_cs *engine)
 {
 	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
 	gen5_gt_disable_irq(engine->gt, engine->irq_enable_mask);
 }

 void hsw_irq_enable_vecs(struct intel_engine_cs *engine)
 {
 	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_enable_mask);

 	/* Flush/delay to ensure the RING_IMR is active before the GT IMR */
 	ENGINE_POSTING_READ(engine, RING_IMR);

 	gen6_gt_pm_unmask_irq(engine->gt, engine->irq_enable_mask);
 }

 void hsw_irq_disable_vecs(struct intel_engine_cs *engine)
 {
 	ENGINE_WRITE(engine, RING_IMR, ~0);
 	gen6_gt_pm_mask_irq(engine->gt, engine->irq_enable_mask);
 }
	// SPDX-License-Identifier: MIT
	/*
	* Copyright © 2020 Intel Corporation
	*/

	#include "gen6_engine_cs.h"
	#include "intel_engine.h"
	#include "intel_gpu_commands.h"
	#include "intel_gt.h"
	#include "intel_gt_irq.h"
	#include "intel_gt_pm_irq.h"
	#include "intel_ring.h"

	#define HWS_SCRATCH_ADDR (I915_GEM_HWS_SCRATCH * sizeof(u32))

	/*
	* Emits a PIPE_CONTROL with a non-zero post-sync operation, for
	* implementing two workarounds on gen6. From section 1.4.7.1
	* "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
	*
	* [DevSNB-C+{W/A}] Before any depth stall flush (including those
	* produced by non-pipelined state commands), software needs to first
	* send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
	* 0.
	*
	* [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
	* =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
	*
	* And the workaround for these two requires this workaround first:
	*
	* [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
	* BEFORE the pipe-control with a post-sync op and no write-cache
	* flushes.
	*
	* And this last workaround is tricky because of the requirements on
	* that bit. From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
	* volume 2 part 1:
	*
	* "1 of the following must also be set:
	* - Render Target Cache Flush Enable ([12] of DW1)
	* - Depth Cache Flush Enable ([0] of DW1)
	* - Stall at Pixel Scoreboard ([1] of DW1)
	* - Depth Stall ([13] of DW1)
	* - Post-Sync Operation ([15:14] of DW1)
	* - Notify Enable ([8] of DW1)"
	*
	* The cache flushes require the workaround flush that triggered this
	* one, so we can't use it. Depth stall would trigger the same.
	* Post-sync nonzero is what triggered this second workaround, so we
	* can't use that one either. Notify enable is IRQs, which aren't
	* really our business. That leaves only stall at scoreboard.
	*/
	static int
	gen6_emit_post_sync_nonzero_flush(struct i915_request *rq)
	{
		u32 scratch_addr =
			intel_gt_scratch_offset(rq->engine->gt,
						INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
		u32 *cs;

		/* First the CS stall, with stall-at-scoreboard as companion bit. */
		cs = intel_ring_begin(rq, 6);
		if (IS_ERR(cs))
			return PTR_ERR(cs);

		*cs++ = GFX_OP_PIPE_CONTROL(5);
		*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
		*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
		*cs++ = 0; /* low dword */
		*cs++ = 0; /* high dword */
		*cs++ = MI_NOOP;
		intel_ring_advance(rq, cs);

		/* Then the PIPE_CONTROL with only a non-zero post-sync op. */
		cs = intel_ring_begin(rq, 6);
		if (IS_ERR(cs))
			return PTR_ERR(cs);

		*cs++ = GFX_OP_PIPE_CONTROL(5);
		*cs++ = PIPE_CONTROL_QW_WRITE;
		*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
		*cs++ = 0;
		*cs++ = 0;
		*cs++ = MI_NOOP;
		intel_ring_advance(rq, cs);

		return 0;
	}

	/*
	 * Emit a gen6 render-engine flush and/or invalidate, selected by the
	 * EMIT_FLUSH and EMIT_INVALIDATE bits of @mode.
	 *
	 * Returns 0 on success, or the negative error code reported while
	 * reserving ring space.
	 */
	int gen6_emit_flush_rcs(struct i915_request *rq, u32 mode)
	{
		u32 scratch_addr =
			intel_gt_scratch_offset(rq->engine->gt,
						INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
		u32 *cs, flags = 0;
		int ret;

		/* Force SNB workarounds for PIPE_CONTROL flushes */
		ret = gen6_emit_post_sync_nonzero_flush(rq);
		if (ret)
			return ret;

		/*
		 * Just flush everything. Experiments have shown that reducing the
		 * number of bits based on the write domains has little performance
		 * impact. And when rearranging requests, the order of flushes is
		 * unknown.
		 */
		if (mode & EMIT_FLUSH) {
			flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
			flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
			/*
			 * Ensure that any following seqno writes only happen
			 * when the render cache is indeed flushed.
			 */
			flags |= PIPE_CONTROL_CS_STALL;
		}
		if (mode & EMIT_INVALIDATE) {
			flags |= PIPE_CONTROL_TLB_INVALIDATE;
			flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
			flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
			flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
			flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
			flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
			/*
			 * TLB invalidate requires a post-sync write.
			 */
			flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL;
		}

		cs = intel_ring_begin(rq, 4);
		if (IS_ERR(cs))
			return PTR_ERR(cs);

		*cs++ = GFX_OP_PIPE_CONTROL(4);
		*cs++ = flags;
		*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
		*cs++ = 0;
		intel_ring_advance(rq, cs);

		return 0;
	}

	/*
	 * Write the gen6 render-engine breadcrumb for @rq at @cs: the two-step
	 * post-sync-nonzero workaround, a flushing PIPE_CONTROL that stores the
	 * fence seqno, and a user interrupt.  Updates rq->tail and returns the
	 * advanced command pointer.
	 */
	u32 *gen6_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
	{
		/* First we do the gen6_emit_post_sync_nonzero_flush w/a */
		*cs++ = GFX_OP_PIPE_CONTROL(4);
		*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
		*cs++ = 0;
		*cs++ = 0;

		*cs++ = GFX_OP_PIPE_CONTROL(4);
		*cs++ = PIPE_CONTROL_QW_WRITE;
		*cs++ = intel_gt_scratch_offset(rq->engine->gt,
						INTEL_GT_SCRATCH_FIELD_DEFAULT) |
			PIPE_CONTROL_GLOBAL_GTT;
		*cs++ = 0;

		/* Finally we can flush and with it emit the breadcrumb */
		*cs++ = GFX_OP_PIPE_CONTROL(4);
		*cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
			 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
			 PIPE_CONTROL_DC_FLUSH_ENABLE |
			 PIPE_CONTROL_QW_WRITE |
			 PIPE_CONTROL_CS_STALL);
		*cs++ = i915_request_active_seqno(rq) |
			PIPE_CONTROL_GLOBAL_GTT;
		*cs++ = rq->fence.seqno;

		*cs++ = MI_USER_INTERRUPT;
		*cs++ = MI_NOOP;

		rq->tail = intel_ring_offset(rq, cs);
		assert_ring_tail_valid(rq->ring, rq->tail);

		return cs;
	}

	/*
	 * Emit an MI_FLUSH_DW with a store-dword post-sync operation into the HWS
	 * scratch slot, with the caller's extra @flags OR-ed into the command.
	 *
	 * Returns 0 on success, or the negative error code reported while
	 * reserving ring space.
	 */
	static int mi_flush_dw(struct i915_request *rq, u32 flags)
	{
		u32 cmd, *cs;

		cs = intel_ring_begin(rq, 4);
		if (IS_ERR(cs))
			return PTR_ERR(cs);

		cmd = MI_FLUSH_DW;

		/*
		 * We always require a command barrier so that subsequent
		 * commands, such as breadcrumb interrupts, are strictly ordered
		 * wrt the contents of the write cache being flushed to memory
		 * (and thus being coherent from the CPU).
		 */
		cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;

		/*
		 * Bspec vol 1c.3 - blitter engine command streamer:
		 * "If ENABLED, all TLBs will be invalidated once the flush
		 * operation is complete. This bit is only valid when the
		 * Post-Sync Operation field is a value of 1h or 3h."
		 */
		cmd |= flags;

		*cs++ = cmd;
		*cs++ = HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
		*cs++ = 0;
		*cs++ = MI_NOOP;

		intel_ring_advance(rq, cs);

		return 0;
	}

	/*
	 * Emit an MI_FLUSH_DW, adding @invflags only when @mode requests an
	 * invalidate.
	 */
	static int gen6_flush_dw(struct i915_request *rq, u32 mode, u32 invflags)
	{
		u32 extra = 0;

		if (mode & EMIT_INVALIDATE)
			extra = invflags;

		return mi_flush_dw(rq, extra);
	}

	/* Flush via MI_FLUSH_DW; an invalidate request adds a TLB invalidate. */
	int gen6_emit_flush_xcs(struct i915_request *rq, u32 mode)
	{
		const u32 on_invalidate = MI_INVALIDATE_TLB;

		return gen6_flush_dw(rq, mode, on_invalidate);
	}

	/*
	 * Flush via MI_FLUSH_DW; an invalidate request adds both the TLB and the
	 * BSD invalidate bits.
	 */
	int gen6_emit_flush_vcs(struct i915_request *rq, u32 mode)
	{
		return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB | MI_INVALIDATE_BSD);
	}

	/*
	 * Emit a batch-buffer start at @offset.  The batch is marked non-secure
	 * unless I915_DISPATCH_SECURE is set in @dispatch_flags.  @len is not
	 * used here.
	 *
	 * Returns 0 on success, or the negative error code reported while
	 * reserving ring space.
	 */
	int gen6_emit_bb_start(struct i915_request *rq,
			       u64 offset, u32 len,
			       unsigned int dispatch_flags)
	{
		const u32 security = (dispatch_flags & I915_DISPATCH_SECURE) ?
				     0 : MI_BATCH_NON_SECURE_I965;
		u32 *out;

		out = intel_ring_begin(rq, 2);
		if (IS_ERR(out))
			return PTR_ERR(out);

		out = __gen6_emit_bb_start(out, offset, security);
		intel_ring_advance(rq, out);

		return 0;
	}

	/*
	 * Haswell variant of the batch-buffer start: a non-secure batch carries
	 * the HSW PPGTT and non-secure bits, a secure one
	 * (I915_DISPATCH_SECURE) carries neither.  @len is not used here.
	 *
	 * Returns 0 on success, or the negative error code reported while
	 * reserving ring space.
	 */
	int
	hsw_emit_bb_start(struct i915_request *rq,
			  u64 offset, u32 len,
			  unsigned int dispatch_flags)
	{
		u32 security;
		u32 *cs;

		security = MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW;
		if (dispatch_flags & I915_DISPATCH_SECURE)
			security = 0;

		cs = intel_ring_begin(rq, 2);
		if (IS_ERR(cs))
			return PTR_ERR(cs);

		cs = __gen6_emit_bb_start(cs, offset, security);
		intel_ring_advance(rq, cs);

		return 0;
	}

	/*
	 * Emit a PIPE_CONTROL that only stalls the command streamer (CS stall
	 * plus stall at scoreboard).
	 *
	 * Returns 0 on success, or the negative error code reported while
	 * reserving ring space.
	 */
	static int gen7_stall_cs(struct i915_request *rq)
	{
		u32 *cs;

		cs = intel_ring_begin(rq, 4);
		if (IS_ERR(cs))
			return PTR_ERR(cs);

		*cs++ = GFX_OP_PIPE_CONTROL(4);
		*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
		*cs++ = 0;
		*cs++ = 0;
		intel_ring_advance(rq, cs);

		return 0;
	}

	/*
	 * Emit a gen7 render-engine flush and/or invalidate, selected by the
	 * EMIT_FLUSH and EMIT_INVALIDATE bits of @mode.
	 *
	 * Returns 0 on success, or a negative error code if ring space could not
	 * be reserved for either the CS-stall workaround or the flush itself.
	 */
	int gen7_emit_flush_rcs(struct i915_request *rq, u32 mode)
	{
		u32 scratch_addr =
			intel_gt_scratch_offset(rq->engine->gt,
						INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
		u32 *cs, flags = 0;
		int err;

		/*
		 * Ensure that any following seqno writes only happen when the render
		 * cache is indeed flushed.
		 *
		 * Workaround: 4th PIPE_CONTROL command (except the ones with only
		 * read-cache invalidate bits set) must have the CS_STALL bit set. We
		 * don't try to be clever and just set it unconditionally.
		 */
		flags |= PIPE_CONTROL_CS_STALL;

		/*
		 * CS_STALL suggests at least a post-sync write.
		 */
		flags |= PIPE_CONTROL_QW_WRITE;
		flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;

		/*
		 * Just flush everything. Experiments have shown that reducing the
		 * number of bits based on the write domains has little performance
		 * impact.
		 */
		if (mode & EMIT_FLUSH) {
			flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
			flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
			flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
			flags |= PIPE_CONTROL_FLUSH_ENABLE;
		}
		if (mode & EMIT_INVALIDATE) {
			flags |= PIPE_CONTROL_TLB_INVALIDATE;
			flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
			flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
			flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
			flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
			flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
			flags |= PIPE_CONTROL_MEDIA_STATE_CLEAR;

			/*
			 * Workaround: we must issue a pipe_control with CS-stall bit
			 * set before a pipe_control command that has the state cache
			 * invalidate bit set.
			 *
			 * If the stall cannot be emitted, bail out rather than
			 * emitting the invalidate without its required workaround.
			 */
			err = gen7_stall_cs(rq);
			if (err)
				return err;
		}

		cs = intel_ring_begin(rq, 4);
		if (IS_ERR(cs))
			return PTR_ERR(cs);

		*cs++ = GFX_OP_PIPE_CONTROL(4);
		*cs++ = flags;
		*cs++ = scratch_addr;
		*cs++ = 0;
		intel_ring_advance(rq, cs);

		return 0;
	}

	/*
	 * Write the gen7 render-engine breadcrumb for @rq at @cs: one flushing
	 * PIPE_CONTROL that stores the fence seqno, followed by a user
	 * interrupt.  Updates rq->tail and returns the advanced command pointer.
	 */
	u32 *gen7_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
	{
		*cs++ = GFX_OP_PIPE_CONTROL(4);
		*cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
			 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
			 PIPE_CONTROL_DC_FLUSH_ENABLE |
			 PIPE_CONTROL_FLUSH_ENABLE |
			 PIPE_CONTROL_QW_WRITE |
			 PIPE_CONTROL_GLOBAL_GTT_IVB |
			 PIPE_CONTROL_CS_STALL);
		*cs++ = i915_request_active_seqno(rq);
		*cs++ = rq->fence.seqno;

		*cs++ = MI_USER_INTERRUPT;
		*cs++ = MI_NOOP;

		rq->tail = intel_ring_offset(rq, cs);
		assert_ring_tail_valid(rq->ring, rq->tail);

		return cs;
	}

	/*
	 * Write the gen6 non-render breadcrumb for @rq at @cs: an MI_FLUSH_DW
	 * that stores the fence seqno at the HWS seqno index, then a user
	 * interrupt.  Updates rq->tail and returns the advanced command pointer.
	 */
	u32 *gen6_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
	{
		/* The indexed store below relies on the seqno living in the engine HWSP. */
		GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
		GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR);

		*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
		*cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
		*cs++ = rq->fence.seqno;

		*cs++ = MI_USER_INTERRUPT;

		rq->tail = intel_ring_offset(rq, cs);
		assert_ring_tail_valid(rq->ring, rq->tail);

		return cs;
	}

	/* Number of times the seqno store is repeated in the gen7 xcs breadcrumb. */
	#define GEN7_XCS_WA 32
	/*
	 * Write the gen7 non-render breadcrumb for @rq at @cs: a TLB-invalidating
	 * MI_FLUSH_DW that stores the fence seqno, GEN7_XCS_WA repeated indexed
	 * stores of the same seqno, a trailing bare MI_FLUSH_DW, and a user
	 * interrupt.  Updates rq->tail and returns the advanced command pointer.
	 */
	u32 *gen7_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
	{
		int i;

		/* The indexed stores below rely on the seqno living in the engine HWSP. */
		GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
		GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR);

		*cs++ = MI_FLUSH_DW | MI_INVALIDATE_TLB |
			MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
		*cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
		*cs++ = rq->fence.seqno;

		for (i = 0; i < GEN7_XCS_WA; i++) {
			*cs++ = MI_STORE_DWORD_INDEX;
			*cs++ = I915_GEM_HWS_SEQNO_ADDR;
			*cs++ = rq->fence.seqno;
		}

		*cs++ = MI_FLUSH_DW;
		*cs++ = 0;
		*cs++ = 0;

		*cs++ = MI_USER_INTERRUPT;
		*cs++ = MI_NOOP;

		rq->tail = intel_ring_offset(rq, cs);
		assert_ring_tail_valid(rq->ring, rq->tail);

		return cs;
	}
	#undef GEN7_XCS_WA

	/*
	 * Unmask the engine's enable and keep bits in RING_IMR, then enable the
	 * engine's interrupt at the GT level.
	 */
	void gen6_irq_enable(struct intel_engine_cs *engine)
	{
		ENGINE_WRITE(engine, RING_IMR,
			     ~(engine->irq_enable_mask | engine->irq_keep_mask));

		/* Flush/delay to ensure the RING_IMR is active before the GT IMR */
		ENGINE_POSTING_READ(engine, RING_IMR);

		gen5_gt_enable_irq(engine->gt, engine->irq_enable_mask);
	}

	void gen6_irq_disable(struct intel_engine_cs *engine)
	{
	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
	gen5_gt_disable_irq(engine->gt, engine->irq_enable_mask);
	}

	void hsw_irq_enable_vecs(struct intel_engine_cs *engine)
	{
	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_enable_mask);

	/* Flush/delay to ensure the RING_IMR is active before the GT IMR */
	ENGINE_POSTING_READ(engine, RING_IMR);

	gen6_gt_pm_unmask_irq(engine->gt, engine->irq_enable_mask);
	}

	void hsw_irq_disable_vecs(struct intel_engine_cs *engine)
	{
	ENGINE_WRITE(engine, RING_IMR, ~0);
	gen6_gt_pm_mask_irq(engine->gt, engine->irq_enable_mask);
	}