From 96ef36fd2c2544b7cc5b6c942247f52a4d450f99 Mon Sep 17 00:00:00 2001
From: sisu
Date: Sun, 1 Dec 2024 18:18:49 +0200
Subject: [PATCH] Add hack4 (ai1337)
---
docs/specs/hxp_ai1337.rst | 192 ++++++++++++++++++++++++++
target/i386/cpu.c | 52 +++++++
target/i386/cpu.h | 17 +++
target/i386/ops_ai1337.h | 8 ++
target/i386/tcg/decode-new.c.inc | 17 +++
target/i386/tcg/emit.c.inc | 47 +++++++
target/i386/tcg/sysemu/excp_helper.c | 12 ++
target/i386/tcg/sysemu/misc_helper.c | 40 ++++++
target/i386/tcg/translate.c | 196 +++++++++++++++++++++++++++
9 files changed, 581 insertions(+)
create mode 100644 docs/specs/hxp_ai1337.rst
create mode 100644 target/i386/ops_ai1337.h
diff --git a/docs/specs/hxp_ai1337.rst b/docs/specs/hxp_ai1337.rst
new file mode 100644
index 000000000..95b6d4280
--- /dev/null
+++ b/docs/specs/hxp_ai1337.rst
@@ -0,0 +1,192 @@
+HXP HACK-4 AI1337 Device Specification
+======================================
+
+The HXP HACK-4 AI1337 is designed to fulfil the compute needs
+of the AI industry. The design is a significant extension to
+the existing X86 architecture to enable fast scratch operations.
+
+High-Level Architecture
+=======================
+
+This section provides a high-level overview of the HXP HACK-4 and
+AI1337 architecture.
+
+Processor Organization
+----------------------
+
+::
+
+ HXP HACK-4 application processor, optimized for scalar compute
+ AI1337 Engine, optimized for very-wide compute
+
+ |------------|---------|-----------|
+ | |---------| |
+ | HXP HACK-4 |---------| AI1337 IP |
+ | |---------| |
+ |------------|---------|-----------|
+ |
+ |
+ PSCHORR interconnect
+
+ PSCHORR very-wide link
+
+The HXP HACK-4 is the application processor responsible for boot and
+executing OS software. The AI1337 execution engine is on-die engine
+responsible for fast scratch operations.
+
+AI1337 Engine Organization
+--------------------------
+
+::
+
+ --------------------------------------------
+ | AI1337 engine | Execution Interconnect
+ | | |
+ |------------------------------------------| |
+ | Slice 0 |-----| --------------------
+ |------------------------------------------| |---| Multi-ALU engine |
+ | Slice 1 |-----| --------------------
+ |------------------------------------------| |
+ | Slice 2 |-----| --------------------
+ |------------------------------------------| |---| Multi-ALU engine |
+ | ... |-----| --------------------
+ |------------------------------------------| |
+ | Slice N |-----| --------------------
+ |------------------------------------------| |---| Multi-ALU engine |
+ --------------------
+
+The AI1337 engine is organized as a vector of interconnected memory slices.
+Slices are interconnected via the 'execution interconnect' in an N-to-N
+fashion, and each cross-slice wide-link is connected to a series of
+multi-ALU engines that support fast addition, subtraction and multiplication.
+
+PSCHORR Interconnect
+--------------------
+
+The PSCHORR Interconnect connects the HACK-4 application processor
+and the AI1337 Engine using a multi-link organization for fast
+slice reads and writes.
+
+The interconnect allows also for addressability of the scratch memory
+through an bi-ATS unit that supports bi-directional addressing of scratch
+and application processor memory.
+
+::
+
+ Physical Memory Virtual Memory
+ 0 |
+ | |
+ IO space | |
+ | |
+ - |
+ | |
+ | |
+ | | Direct Addressing
+ RAM | | |
+ | ___________________________|_____ |
+ | / | | |
+ ---/ | bi-ATS |------|
+ | | |
+ | _______________________|________|
+ AI1337 | /
+ aperture | / PSCHORR Interconnect
+ ---
+
+ISA Contributions
+=================
+
+This section describes the ISA contributions to the X86_64 ISA.
+The added instructions are responsible for updating scratch memory
+on the AI1337 engine and for submitting work to the AI1337 engine.
+The ISA also includes instructions for fast reconfiguration of the
+PSCHORR interconnect.
+
+
+.. list-table:: ISA
+ :widths: 25 25 50
+ :header-rows: 1
+
+ * - Opcode
+ - Instruction
+ - Description
+ * - 0F 0A 83
+ - MTS
+ - Load RCX bytes from memory address (RSI) to slice (RBX) at slice offset (RDI)
+ * - 0F 0A 84
+ - STM
+ - Read RCX bytes from slice (RBX) at slice offset (RDI) and write memory address (RSI)
+ * - 0F 0A 85
+ - FSCR
+ - Clear all slices
+ * - 0F 0A 86
+ - SCRADD
+ - Add the slices pointed by RDI and RSI, and store the result into slice pointed by RDX
+ * - 0F 0A 87
+ - SCRSUB
+ - Subtract the slices pointed by RDI and RSI, and store the result into slice pointed by RDX
+ * - 0F 0A 88
+ - SCRMUL
+ - Multiply the slices pointed by RDI and RSI, and store the result into slice pointed by RDX
+ * - 0F 0A 89
+ - SCRHLW (privileged)
+ - Update scratch memory PSCHORR bi-ATS base VA
+ * - 0F 0A 8A
+ - SCRHLR
+ - Read scratch memory PSCHORR bi-ATS base VA
+
+System-Level Contributions
+==========================
+
+This section provides information on system-level specification and configuration,
+and it's primarily targeted towards kernel developers.
+
+Specification
+-------------
+
+The AI1337 engine support is dictated by the existence of the 0x80000022 CPUID leaf.
+If the AI1337 CPUID leaf exists, the EAX, ECX, EDX and EBX registers provide the following information:
+
+.. list-table:: CPUID 0x80000022
+ :widths: 25 25 50
+ :header-rows: 1
+
+ * - Register
+ - Bits
+ - Information
+ * - EAX
+ - 0-31
+ - Total scratch memory size
+ * - ECX
+ - 0-9
+ - Maximum number of slices
+ * - ECX
+ - 10-31
+ - Maximum slice size in bytes
+ * - EDX
+ - 0-31
+ - Low 32 bits of the AI1337 Aperture
+ * - EBX
+ - 0-31
+ - High 32 bits of the AI1337 Aperture
+
+Configuration
+-------------
+
+The AI1337 engine is a multi-configurable engine that software can
+utilize for scaling up for high-computing workloads and scaling
+down for power-efficiency.
+
+.. list-table:: MSR
+ :widths: 40 25 50
+ :header-rows: 1
+
+ * - MSR
+ - Identifier
+ - Description
+ * - MSR_HACK4_SLICE_SIZE
+ - 0xC0000105
+ - Read/Write slice size in the AI1337 engine
+ * - MSR_HACK4_NUM_SLICES
+ - 0xC0000106
+ - Read/Write count of slices in the AI1337 engine
+
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 85ef7452c..197a813f7 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -43,9 +43,13 @@
#include "hw/i386/sgx-epc.h"
#endif
+#include "exec/ramblock.h"
+
#include "disas/capstone.h"
#include "cpu-internal.h"
+#include "ops_ai1337.h"
+
static void x86_cpu_realizefn(DeviceState *dev, Error **errp);
/* Helpers for building CPUID[2] descriptors: */
@@ -5256,6 +5260,26 @@ static const X86CPUDefinition builtin_x86_defs[] = {
.model_id = "AMD EPYC-Genoa Processor",
.cache_info = &epyc_genoa_cache_info,
},
+ {
+ .name = "hxp-ai1337",
+ .level = 0xd,
+ .vendor = CPUID_VENDOR_AMD,
+ .family = 25,
+ .model = 1,
+ .stepping = 1,
+ .features[FEAT_1_EDX] =
+ PPRO_FEATURES |
+ CPUID_MTRR | CPUID_CLFLUSH | CPUID_MCA |
+ CPUID_PSE36,
+ .features[FEAT_1_ECX] =
+ CPUID_EXT_SSE3 | CPUID_EXT_CX16 | CPUID_EXT_RDRAND,
+ .features[FEAT_8000_0001_EDX] =
+ CPUID_EXT2_LM | CPUID_EXT2_SYSCALL | CPUID_EXT2_NX,
+ .features[FEAT_8000_0001_ECX] =
+ CPUID_EXT3_LAHF_LM | CPUID_EXT3_SVM,
+ .xlevel = 0x80000022,
+ .model_id = "HXP Silicon Foundaries AI 1337 Processor",
+ },
};
/*
@@ -5688,6 +5712,11 @@ static inline void feat2prop(char *s)
}
}
+uint64_t x86_calculate_scratch_size(CPUX86State* env)
+{
+ return (env->scratch_config.slice_size * env->scratch_config.num_active_slices);
+}
+
/* Return the feature property name for a feature flag bit */
static const char *x86_cpu_feature_name(FeatureWord w, int bitnr)
{
@@ -7044,6 +7073,13 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count,
*eax = env->features[FEAT_8000_0021_EAX];
*ebx = *ecx = *edx = 0;
break;
+ case 0x80000022:
+ *eax = *ebx = *ecx = *edx = 0;
+ *ecx = (AI1337_SCRATCH_MAX_SLICE_SIZE << 10) | AI1337_SCRATCH_MAX_NUM_SLICES;
+ *eax = AI1337_SCRATCH_SIZE;
+ *edx = (AI1337_SCRATCH_PHYS_BASE & 0xFFFFFFFFU);
+ *ebx = ((AI1337_SCRATCH_PHYS_BASE >> 32U) & 0xFFFFFFFFU);
+ break;
default:
/* reserved values: zero */
*eax = 0;
@@ -8052,6 +8088,22 @@ static void x86_cpu_initfn(Object *obj)
if (xcc->model) {
x86_cpu_load_model(cpu, xcc->model);
}
+
+ {
+ env->scratch_config.num_active_slices = AI1337_SCRATCH_NUM_SLICES_DEFAULT;
+ env->scratch_config.slice_size = AI1337_SCRATCH_SLICE_SIZE_DEFAULT;
+ env->scratch_config.va_base = AI1337_SCRATCH_VA_BASE;
+ env->scratch_config.phys_base = AI1337_SCRATCH_PHYS_BASE;
+ env->scratch_config.access_enabled = 0;
+
+ uint16_t scratch[AI1337_SCRATCH_SIZE];
+ env->scratch_region = malloc(sizeof(MemoryRegion));
+ memset(env->scratch_region, 0, sizeof(*env->scratch_region));
+ memory_region_init_ram_ptr(env->scratch_region, NULL, "ai1337-scratch", AI1337_SCRATCH_SIZE, scratch);
+ env->scratch_region->ram_block->flags |= RAM_RESIZEABLE;
+ env->scratch_region->ram_block->max_length = AI1337_SCRATCH_MAX_NUM_SLICES * AI1337_SCRATCH_MAX_SLICE_SIZE;
+ memory_region_add_subregion(get_system_memory(), AI1337_SCRATCH_PHYS_BASE, env->scratch_region);
+ }
}
static int64_t x86_cpu_get_arch_id(CPUState *cs)
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 14edd57a3..778c9a730 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -544,6 +544,9 @@ typedef enum X86Seg {
#define MSR_IA32_XFD 0x000001c4
#define MSR_IA32_XFD_ERR 0x000001c5
+#define MSR_HACK4_SLICE_SIZE 0xc0000105
+#define MSR_HACK4_NUM_SLICES 0xc0000106
+
/* FRED MSRs */
#define MSR_IA32_FRED_RSP0 0x000001cc /* Stack level 0 regular stack pointer */
#define MSR_IA32_FRED_RSP1 0x000001cd /* Stack level 1 regular stack pointer */
@@ -1681,6 +1684,14 @@ typedef struct HVFX86LazyFlags {
target_ulong auxbits;
} HVFX86LazyFlags;
+typedef struct ScratchConfig {
+ uint64_t va_base;
+ uint64_t phys_base;
+ size_t num_active_slices;
+ size_t slice_size;
+ int access_enabled;
+} ScratchConfig;
+
typedef struct CPUArchState {
/* standard registers */
target_ulong regs[CPU_NB_REGS];
@@ -1996,6 +2007,10 @@ typedef struct CPUArchState {
/* Bitmap of available CPU topology levels for this CPU. */
DECLARE_BITMAP(avail_cpu_topo, CPU_TOPO_LEVEL_MAX);
+
+ MemoryRegion *scratch_region;
+ ScratchConfig scratch_config;
+
} CPUX86State;
struct kvm_msrs;
@@ -2639,6 +2654,8 @@ void x86_cpu_xsave_all_areas(X86CPU *cpu, void *buf, uint32_t buflen);
uint32_t xsave_area_size(uint64_t mask, bool compacted);
void x86_update_hflags(CPUX86State* env);
+uint64_t x86_calculate_scratch_size(CPUX86State* env);
+
static inline bool hyperv_feat_enabled(X86CPU *cpu, int feat)
{
return !!(cpu->hyperv_features & BIT(feat));
diff --git a/target/i386/ops_ai1337.h b/target/i386/ops_ai1337.h
new file mode 100644
index 000000000..7aea6ae78
--- /dev/null
+++ b/target/i386/ops_ai1337.h
@@ -0,0 +1,8 @@
+
+#define AI1337_SCRATCH_VA_BASE 0xFFFFFFFFFFA00000ULL
+#define AI1337_SCRATCH_PHYS_BASE 0xFFFFFFFFFFF00000ULL
+#define AI1337_SCRATCH_SIZE (33ULL * 1024)
+#define AI1337_SCRATCH_MAX_NUM_SLICES (128)
+#define AI1337_SCRATCH_SLICE_SIZE_DEFAULT (1024ULL)
+#define AI1337_SCRATCH_NUM_SLICES_DEFAULT (33UL)
+#define AI1337_SCRATCH_MAX_SLICE_SIZE (4096ULL)
diff --git a/target/i386/tcg/decode-new.c.inc b/target/i386/tcg/decode-new.c.inc
index 30be9237c..968042464 100644
--- a/target/i386/tcg/decode-new.c.inc
+++ b/target/i386/tcg/decode-new.c.inc
@@ -1032,6 +1032,21 @@ static void decode_0F5A(DisasContext *s, CPUX86State *env, X86OpEntry *entry, ui
*entry = *decode_by_prefix(s, opcodes_0F5A);
}
+static void decode_0F0A(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+ uint8_t c = x86_ldub_code(env, s);
+ switch (c) {
+ case 0x83: entry->gen = gen_MTS; break;
+ case 0x84: entry->gen = gen_STM; break;
+ case 0x85: entry->gen = gen_FSCR; break;
+ case 0x86: entry->gen = gen_SCRADD; break;
+ case 0x87: entry->gen = gen_SCRSUB; break;
+ case 0x88: entry->gen = gen_SCRMUL; break;
+ case 0x89: entry->gen = gen_SCRHLW; break;
+ case 0x8a: entry->gen = gen_SCRHLR; break;
+ }
+}
+
static void decode_0F5B(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
{
static const X86OpEntry opcodes_0F5B[4] = {
@@ -1273,6 +1288,8 @@ static const X86OpEntry opcodes_0F[256] = {
[0x7e] = X86_OP_GROUP0(0F7E),
[0x7f] = X86_OP_GROUP0(0F7F),
+ [0x0a] = X86_OP_GROUP0(0F0A),
+
[0x88] = X86_OP_ENTRYr(Jcc, J,z_f64),
[0x89] = X86_OP_ENTRYr(Jcc, J,z_f64),
[0x8a] = X86_OP_ENTRYr(Jcc, J,z_f64),
diff --git a/target/i386/tcg/emit.c.inc b/target/i386/tcg/emit.c.inc
index 9b5041991..9a2e57b8f 100644
--- a/target/i386/tcg/emit.c.inc
+++ b/target/i386/tcg/emit.c.inc
@@ -3853,6 +3853,53 @@ static void gen_SUB(DisasContext *s, X86DecodedInsn *decode)
prepare_update2_cc(decode, s, CC_OP_SUBB + ot);
}
+static void gen_MTS(DisasContext *s, X86DecodedInsn *decode)
+{
+ gen_repz(s, MO_8, gen_mts_8);
+}
+
+static void gen_FSCR(DisasContext *s, X86DecodedInsn *decode)
+{
+ gen_fscr(s);
+}
+
+static void gen_SCRHLW(DisasContext *s, X86DecodedInsn *decode)
+{
+ if (CPL(s) != 0)
+ {
+ gen_illegal_opcode(s);
+ return;
+ }
+ size_t va_base_offset = offsetof(CPUX86State, scratch_config.va_base);
+ tcg_gen_st_tl(cpu_regs[R_EDI], tcg_env, va_base_offset);
+}
+
+static void gen_SCRHLR(DisasContext *s, X86DecodedInsn *decode)
+{
+ size_t va_base_offset = offsetof(CPUX86State, scratch_config.va_base);
+ tcg_gen_ld_tl(cpu_regs[R_EAX], tcg_env, va_base_offset);
+}
+
+static void gen_STM(DisasContext *s, X86DecodedInsn *decode)
+{
+ gen_repz(s, MO_8, gen_stm_8);
+}
+
+static void gen_SCRADD(DisasContext *s, X86DecodedInsn *decode)
+{
+ gen_slice_op(s, SLICE_OP_TYPE_ADD);
+}
+
+static void gen_SCRSUB(DisasContext *s, X86DecodedInsn *decode)
+{
+ gen_slice_op(s, SLICE_OP_TYPE_SUB);
+}
+
+static void gen_SCRMUL(DisasContext *s, X86DecodedInsn *decode)
+{
+ gen_slice_op(s, SLICE_OP_TYPE_MUL);
+}
+
static void gen_SYSCALL(DisasContext *s, X86DecodedInsn *decode)
{
gen_update_cc_op(s);
diff --git a/target/i386/tcg/sysemu/excp_helper.c b/target/i386/tcg/sysemu/excp_helper.c
index 8fb05b1f5..f524f97c2 100644
--- a/target/i386/tcg/sysemu/excp_helper.c
+++ b/target/i386/tcg/sysemu/excp_helper.c
@@ -23,6 +23,7 @@
#include "exec/exec-all.h"
#include "exec/page-protection.h"
#include "tcg/helper-tcg.h"
+#include "../../ops_ai1337.h"
typedef struct TranslateParams {
target_ulong addr;
@@ -600,6 +601,17 @@ bool x86_cpu_tlb_fill(CPUState *cs, vaddr addr, int size,
TranslateResult out;
TranslateFault err;
+ if (env->scratch_config.access_enabled &&
+ (addr >= env->scratch_config.va_base) &&
+ ((addr + size) <= (env->scratch_config.va_base + x86_calculate_scratch_size(env)))) {
+ vaddr paddr = env->scratch_config.phys_base + (addr - env->scratch_config.va_base);
+ tlb_set_page_with_attrs(cs, addr & TARGET_PAGE_MASK,
+ paddr & TARGET_PAGE_MASK,
+ cpu_get_mem_attrs(env),
+ PAGE_READ | PAGE_WRITE | PAGE_EXEC, mmu_idx, TARGET_PAGE_SIZE);
+ return true;
+ }
+
if (get_physical_address(env, addr, access_type, mmu_idx, &out, &err,
retaddr)) {
/*
diff --git a/target/i386/tcg/sysemu/misc_helper.c b/target/i386/tcg/sysemu/misc_helper.c
index 094aa56a2..78fd3a573 100644
--- a/target/i386/tcg/sysemu/misc_helper.c
+++ b/target/i386/tcg/sysemu/misc_helper.c
@@ -26,6 +26,7 @@
#include "exec/exec-all.h"
#include "tcg/helper-tcg.h"
#include "hw/i386/apic.h"
+#include "../../ops_ai1337.h"
void helper_outb(CPUX86State *env, uint32_t port, uint32_t data)
{
@@ -128,6 +129,27 @@ void helper_write_crN(CPUX86State *env, int reg, target_ulong t0)
}
}
+static bool helper_recalculate_scratch(CPUX86State *env, uint32_t new_num_slices, uint32_t new_slice_size)
+{
+ if (new_num_slices > AI1337_SCRATCH_MAX_NUM_SLICES) {
+ return false;
+ }
+ if (new_slice_size > AI1337_SCRATCH_MAX_SLICE_SIZE) {
+ return false;
+ }
+ uint32_t new_size = new_num_slices * new_slice_size;
+ Error *err = NULL;
+ bql_lock();
+ memory_region_ram_resize(env->scratch_region, new_size, &err);
+ bql_unlock();
+ if (err) {
+ return false;
+ }
+ env->scratch_config.num_active_slices = new_num_slices;
+ env->scratch_config.slice_size = new_slice_size;
+ return true;
+}
+
void helper_wrmsr(CPUX86State *env)
{
uint64_t val;
@@ -306,6 +328,18 @@ void helper_wrmsr(CPUX86State *env)
break;
}
+ case MSR_HACK4_SLICE_SIZE:
+ const uint32_t new_slice_size = val;
+ if (!helper_recalculate_scratch(env, env->scratch_config.num_active_slices, new_slice_size)) {
+ goto error;
+ }
+ break;
+ case MSR_HACK4_NUM_SLICES:
+ const uint32_t new_num_active_slices = val;
+ if (!helper_recalculate_scratch(env, new_num_active_slices, env->scratch_config.slice_size)) {
+ goto error;
+ }
+ break;
default:
if ((uint32_t)env->regs[R_ECX] >= MSR_MC0_CTL
&& (uint32_t)env->regs[R_ECX] < MSR_MC0_CTL +
@@ -333,6 +367,12 @@ void helper_rdmsr(CPUX86State *env)
cpu_svm_check_intercept_param(env, SVM_EXIT_MSR, 0, GETPC());
switch ((uint32_t)env->regs[R_ECX]) {
+ case MSR_HACK4_SLICE_SIZE:
+ val = env->scratch_config.slice_size;
+ break;
+ case MSR_HACK4_NUM_SLICES:
+ val = env->scratch_config.num_active_slices;
+ break;
case MSR_IA32_SYSENTER_CS:
val = env->sysenter_cs;
break;
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index 98f5fe61e..0fd28c60f 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -21,6 +21,7 @@
#include "qemu/host-utils.h"
#include "cpu.h"
#include "exec/exec-all.h"
+#include "tcg/tcg-op-common.h"
#include "tcg/tcg-op.h"
#include "tcg/tcg-op-gvec.h"
#include "exec/translator.h"
@@ -32,6 +33,8 @@
#include "exec/log.h"
+#include "ops_ai1337.h"
+
#define HELPER_H "helper.h"
#include "exec/helper-info.c.inc"
#undef HELPER_H
@@ -1198,6 +1201,199 @@ static void gen_stos(DisasContext *s, MemOp ot)
gen_op_add_reg(s, s->aflag, R_EDI, gen_compute_Dshift(s, ot));
}
+static void gen_fscr(DisasContext *s)
+{
+ TCGLabel *l1 = gen_new_label();
+ TCGLabel *l2 = gen_new_label();
+
+ const size_t slice_size_offset = offsetof(CPUX86State, scratch_config.slice_size);
+ const size_t slice_count_offset = offsetof(CPUX86State, scratch_config.num_active_slices);
+ const size_t va_base_offset = offsetof(CPUX86State, scratch_config.va_base);
+ const size_t access_offset = offsetof(CPUX86State, scratch_config.access_enabled);
+
+ tcg_gen_st_tl(tcg_constant_i64(1), tcg_env, access_offset);
+
+ // Calculate size
+ tcg_gen_ld32u_tl(s->tmp0, tcg_env, slice_size_offset);
+ tcg_gen_ld32u_tl(s->tmp4, tcg_env, slice_count_offset);
+ tcg_gen_mul_tl(s->tmp0, s->tmp0, s->tmp4);
+
+ // For loop to clear memory
+ gen_set_label(l1);
+ gen_update_cc_op(s);
+ TCGv tmp = gen_ext_tl(NULL, s->tmp0, s->aflag, false);
+ tcg_gen_brcondi_tl(TCG_COND_EQ, tmp, 0, l2);
+ tcg_gen_sub_tl(s->tmp0, s->tmp0, tcg_constant_i64(1));
+ tcg_gen_ld_tl(s->A0, tcg_env, va_base_offset);
+ gen_lea_v_seg(s, s->A0, R_ES, -1);
+ tcg_gen_add_tl(s->A0, s->A0, s->tmp0);
+ gen_op_st_v(s, MO_8, tcg_constant_i64(0), s->A0);
+ tmp = gen_ext_tl(NULL, s->tmp0, s->aflag, false);
+ tcg_gen_brcondi_tl(TCG_COND_NE, tmp, 0, l1);
+ gen_set_label(l2);
+
+ tcg_gen_st_tl(tcg_constant_i64(0), tcg_env, access_offset);
+}
+
+typedef enum SLICE_OP_TYPE {
+ SLICE_OP_TYPE_ADD,
+ SLICE_OP_TYPE_SUB,
+ SLICE_OP_TYPE_MUL,
+} SLICE_OP_TYPE;
+
+static void gen_illegal_opcode(DisasContext *s);
+
+static void gen_slice_op(DisasContext *s, SLICE_OP_TYPE op_type)
+{
+ TCGLabel *l1 = gen_new_label();
+ TCGLabel *l2 = gen_new_label();
+
+ const size_t slice_size_offset = offsetof(CPUX86State, scratch_config.slice_size);
+ const size_t va_base_offset = offsetof(CPUX86State, scratch_config.va_base);
+ const size_t access_offset = offsetof(CPUX86State, scratch_config.access_enabled);
+
+ const TCGv slice_a = cpu_regs[R_EDI];
+ const TCGv slice_b = cpu_regs[R_ESI];
+ const TCGv slice_c = cpu_regs[R_EDX];
+
+ tcg_gen_st_tl(tcg_constant_i64(1), tcg_env, access_offset);
+
+ // slice size
+ tcg_gen_ld32u_tl(s->tmp0, tcg_env, slice_size_offset);
+
+ // tmp4 always holds the const slice size
+ tcg_gen_mov_tl(s->tmp4, s->tmp0);
+
+ // For loop to clear memory
+ gen_set_label(l1);
+ gen_update_cc_op(s);
+ TCGv tmp = gen_ext_tl(NULL, s->tmp0, s->aflag, false);
+ tcg_gen_brcondi_tl(TCG_COND_EQ, tmp, 0, l2);
+
+ // slice_size -= 8
+ tcg_gen_sub_tl(s->tmp0, s->tmp0, tcg_constant_i64(8));
+
+ // load slice_a value into T1
+ // A0, T1 initialized
+ tcg_gen_ld_tl(s->A0, tcg_env, va_base_offset);
+ gen_lea_v_seg(s, s->A0, R_ES, -1);
+ tcg_gen_mul_tl(s->T1, slice_a, s->tmp4);
+ tcg_gen_add_tl(s->A0, s->A0, s->T1);
+ tcg_gen_add_tl(s->A0, s->A0, s->tmp0);
+ gen_op_ld_v(s, MO_64, s->T1, s->A0);
+
+ // load slice_b value into T0
+ // A0, T0 initialized
+ tcg_gen_ld_tl(s->A0, tcg_env, va_base_offset);
+ gen_lea_v_seg(s, s->A0, R_ES, -1);
+ tcg_gen_mul_tl(s->T0, slice_b, s->tmp4);
+ tcg_gen_add_tl(s->A0, s->A0, s->T0);
+ tcg_gen_add_tl(s->A0, s->A0, s->tmp0);
+ gen_op_ld_v(s, MO_64, s->T0, s->A0);
+
+ // T0 holds the result of the operation
+ switch (op_type)
+ {
+ case SLICE_OP_TYPE_ADD:
+ tcg_gen_add_tl(s->T0, s->T1, s->T0);
+ break;
+ case SLICE_OP_TYPE_SUB:
+ tcg_gen_sub_tl(s->T0, s->T1, s->T0);
+ break;
+ case SLICE_OP_TYPE_MUL:
+ tcg_gen_mul_tl(s->T0, s->T1, s->T0);
+ break;
+ default:
+ gen_illegal_opcode(s);
+ return;
+ }
+
+ // Calculate address for slice_c slot
+ tcg_gen_ld_tl(s->A0, tcg_env, va_base_offset);
+ gen_lea_v_seg(s, s->A0, R_ES, -1);
+ tcg_gen_mul_tl(s->T1, slice_c, s->tmp4);
+ tcg_gen_add_tl(s->A0, s->A0, s->T1);
+ tcg_gen_add_tl(s->A0, s->A0, s->tmp0);
+ gen_op_st_v(s, MO_64, s->T0, s->A0);
+
+ tmp = gen_ext_tl(NULL, s->tmp0, s->aflag, false);
+ tcg_gen_brcondi_tl(TCG_COND_NE, tmp, 0, l1);
+ gen_set_label(l2);
+
+ tcg_gen_st_tl(tcg_constant_i64(0), tcg_env, access_offset);
+}
+
+static void gen_mts_8(DisasContext *s, MemOp ot)
+{
+ const size_t slice_size_offset = offsetof(CPUX86State, scratch_config.slice_size);
+ const size_t va_base_offset = offsetof(CPUX86State, scratch_config.va_base);
+ const size_t access_offset = offsetof(CPUX86State, scratch_config.access_enabled);
+
+ const TCGv slice_index = cpu_regs[R_EBX];
+ const TCGv offset_in_slice = cpu_regs[R_EDI];
+ const TCGv memory_address = cpu_regs[R_ESI];
+ const TCGv dshift = gen_compute_Dshift(s, ot);
+
+ tcg_gen_st_tl(tcg_constant_i64(1), tcg_env, access_offset);
+
+ // load from memory address
+ gen_lea_v_seg(s, memory_address, R_DS, -1);
+ gen_op_ld_v(s, MO_8, s->T0, s->A0);
+
+ // Calculate address for scratch
+ // A0 = offset_in_slice + slice_base + (slice_index * slice_size)
+ tcg_gen_ld_tl(s->A0, tcg_env, va_base_offset);
+ gen_lea_v_seg(s, s->A0, R_ES, -1);
+ tcg_gen_add_tl(s->A0, s->A0, offset_in_slice);
+ tcg_gen_ld32u_tl(s->tmp0, tcg_env, slice_size_offset);
+ tcg_gen_mul_tl(s->tmp0, s->tmp0, slice_index);
+ tcg_gen_add_tl(s->A0, s->A0, s->tmp0);
+
+ // Store value
+ gen_op_st_v(s, MO_8, s->T0, s->A0);
+
+ gen_op_add_reg(s, s->aflag, R_ESI, dshift);
+ gen_op_add_reg(s, s->aflag, R_EDI, dshift);
+
+ tcg_gen_st_tl(tcg_constant_i64(0), tcg_env, access_offset);
+}
+
+static void gen_stm_8(DisasContext *s, MemOp ot)
+{
+ const size_t va_base_offset = offsetof(CPUX86State, scratch_config.va_base);
+ const size_t slice_size_offset = offsetof(CPUX86State, scratch_config.slice_size);
+ const size_t access_offset = offsetof(CPUX86State, scratch_config.access_enabled);
+
+ const TCGv slice_index = cpu_regs[R_EBX];
+ const TCGv offset_in_slice = cpu_regs[R_EDI];
+ const TCGv memory_address = cpu_regs[R_ESI];
+ const TCGv dshift = gen_compute_Dshift(s, ot);
+
+ tcg_gen_st_tl(tcg_constant_i64(1), tcg_env, access_offset);
+
+ // Calculate address for scratch
+ // A0 = offset_in_slice + slice_base + (slice_index * slice_size)
+ tcg_gen_ld_tl(s->A0, tcg_env, va_base_offset);
+ gen_lea_v_seg(s, s->A0, R_ES, -1);
+ tcg_gen_add_tl(s->A0, s->A0, offset_in_slice);
+
+ tcg_gen_ld32u_tl(s->tmp0, tcg_env, slice_size_offset);
+ tcg_gen_mul_tl(s->tmp0, s->tmp0, slice_index);
+ tcg_gen_add_tl(s->A0, s->A0, s->tmp0);
+
+ // Load value from scratch
+ gen_op_ld_v(s, MO_8, s->T0, s->A0);
+
+ // Write to memory address
+ gen_lea_v_seg(s, memory_address, R_DS, -1);
+ gen_op_st_v(s, MO_8, s->T0, s->A0);
+
+ gen_op_add_reg(s, s->aflag, R_ESI, dshift);
+ gen_op_add_reg(s, s->aflag, R_EDI, dshift);
+
+ tcg_gen_st_tl(tcg_constant_i64(0), tcg_env, access_offset);
+}
+
static void gen_lods(DisasContext *s, MemOp ot)
{
gen_string_movl_A0_ESI(s);
--
2.34.1