/..

#CONTENT

#TOP

0001-Add-hack4-ai1337.patchTEXT
   1 
 
2
 
3
 
4
 
5
 
6
 
7
 
8
 
9
 
10
 
11
 
12
 
13
 
14
 
15
 
16
 
17
 
18
 
19
 
20
 
21
 
22
 
23
 
24
 
25
 
26
 
27
 
28
 
29
 
30
 
31
 
32
 
33
 
34
 
35
 
36
 
37
 
38
 
39
 
40
 
41
 
42
 
43
 
44
 
45
 
46
 
47
 
48
 
49
 
50
 
51
 
52
 
53
 
54
 
55
 
56
 
57
 
58
 
59
 
60
 
61
 
62
 
63
 
64
 
65
 
66
 
67
 
68
 
69
 
70
 
71
 
72
 
73
 
74
 
75
 
76
 
77
 
78
 
79
 
80
 
81
 
82
 
83
 
84
 
85
 
86
 
87
 
88
 
89
 
90
 
91
 
92
 
93
 
94
 
95
 
96
 
97
 
98
 
99
 
100
 
101
 
102
 
103
 
104
 
105
 
106
 
107
 
108
 
109
 
110
 
111
 
112
 
113
 
114
 
115
 
116
 
117
 
118
 
119
 
120
 
121
 
122
 
123
 
124
 
125
 
126
 
127
 
128
 
129
 
130
 
131
 
132
 
133
 
134
 
135
 
136
 
137
 
138
 
139
 
140
 
141
 
142
 
143
 
144
 
145
 
146
 
147
 
148
 
149
 
150
 
151
 
152
 
153
 
154
 
155
 
156
 
157
 
158
 
159
 
160
 
161
 
162
 
163
 
164
 
165
 
166
 
167
 
168
 
169
 
170
 
171
 
172
 
173
 
174
 
175
 
176
 
177
 
178
 
179
 
180
 
181
 
182
 
183
 
184
 
185
 
186
 
187
 
188
 
189
 
190
 
191
 
192
 
193
 
194
 
195
 
196
 
197
 
198
 
199
 
200
 
201
 
202
 
203
 
204
 
205
 
206
 
207
 
208
 
209
 
210
 
211
 
212
 
213
 
214
 
215
 
216
 
217
 
218
 
219
 
220
 
221
 
222
 
223
 
224
 
225
 
226
 
227
 
228
 
229
 
230
 
231
 
232
 
233
 
234
 
235
 
236
 
237
 
238
 
239
 
240
 
241
 
242
 
243
 
244
 
245
 
246
 
247
 
248
 
249
 
250
 
251
 
252
 
253
 
254
 
255
 
256
 
257
 
258
 
259
 
260
 
261
 
262
 
263
 
264
 
265
 
266
 
267
 
268
 
269
 
270
 
271
 
272
 
273
 
274
 
275
 
276
 
277
 
278
 
279
 
280
 
281
 
282
 
283
 
284
 
285
 
286
 
287
 
288
 
289
 
290
 
291
 
292
 
293
 
294
 
295
 
296
 
297
 
298
 
299
 
300
 
301
 
302
 
303
 
304
 
305
 
306
 
307
 
308
 
309
 
310
 
311
 
312
 
313
 
314
 
315
 
316
 
317
 
318
 
319
 
320
 
321
 
322
 
323
 
324
 
325
 
326
 
327
 
328
 
329
 
330
 
331
 
332
 
333
 
334
 
335
 
336
 
337
 
338
 
339
 
340
 
341
 
342
 
343
 
344
 
345
 
346
 
347
 
348
 
349
 
350
 
351
 
352
 
353
 
354
 
355
 
356
 
357
 
358
 
359
 
360
 
361
 
362
 
363
 
364
 
365
 
366
 
367
 
368
 
369
 
370
 
371
 
372
 
373
 
374
 
375
 
376
 
377
 
378
 
379
 
380
 
381
 
382
 
383
 
384
 
385
 
386
 
387
 
388
 
389
 
390
 
391
 
392
 
393
 
394
 
395
 
396
 
397
 
398
 
399
 
400
 
401
 
402
 
403
 
404
 
405
 
406
 
407
 
408
 
409
 
410
 
411
 
412
 
413
 
414
 
415
 
416
 
417
 
418
 
419
 
420
 
421
 
422
 
423
 
424
 
425
 
426
 
427
 
428
 
429
 
430
 
431
 
432
 
433
 
434
 
435
 
436
 
437
 
438
 
439
 
440
 
441
 
442
 
443
 
444
 
445
 
446
 
447
 
448
 
449
 
450
 
451
 
452
 
453
 
454
 
455
 
456
 
457
 
458
 
459
 
460
 
461
 
462
 
463
 
464
 
465
 
466
 
467
 
468
 
469
 
470
 
471
 
472
 
473
 
474
 
475
 
476
 
477
 
478
 
479
 
480
 
481
 
482
 
483
 
484
 
485
 
486
 
487
 
488
 
489
 
490
 
491
 
492
 
493
 
494
 
495
 
496
 
497
 
498
 
499
 
500
 
501
 
502
 
503
 
504
 
505
 
506
 
507
 
508
 
509
 
510
 
511
 
512
 
513
 
514
 
515
 
516
 
517
 
518
 
519
 
520
 
521
 
522
 
523
 
524
 
525
 
526
 
527
 
528
 
529
 
530
 
531
 
532
 
533
 
534
 
535
 
536
 
537
 
538
 
539
 
540
 
541
 
542
 
543
 
544
 
545
 
546
 
547
 
548
 
549
 
550
 
551
 
552
 
553
 
554
 
555
 
556
 
557
 
558
 
559
 
560
 
561
 
562
 
563
 
564
 
565
 
566
 
567
 
568
 
569
 
570
 
571
 
572
 
573
 
574
 
575
 
576
 
577
 
578
 
579
 
580
 
581
 
582
 
583
 
584
 
585
 
586
 
587
 
588
 
589
 
590
 
591
 
592
 
593
 
594
 
595
 
596
 
597
 
598
 
599
 
600
 
601
 
602
 
603
 
604
 
605
 
606
 
607
 
608
 
609
 
610
 
611
 
612
 
613
 
614
 
615
 
616
 
617
 
618
 
619
 
620
 
621
 
622
 
623
 
624
 
625
 
626
 
627
 
628
 
629
 
630
 
631
 
632
 
633
 
634
 
635
 
636
 
637
 
638
 
639
 
640
 
641
 
642
 
643
 
644
 
645
 
646
 
647
 
648
 
649
 
650
 
651
 
652
 
653
 
654
 
655
 
656
 
657
 
658
 
659
 
660
 
661
 
662
 
663
 
664
 
665
 
666
 
667
 
668
 
669
 
670
 
671
 
672
 
673
 
674
 
675
 
676
 
677
 
678
 
679
 
680
 
681
 
682
 
683
 
684
 
685
 
686
 
687
 
688
 
689
 
690
 
691
 
692
 
693
 
694
 
695
 
696
 
697
 
698
 
699
 
700
 
701
 
702
 
703
 
704
 
705
 
706
 
707
 
708
 
709
 
710
 
711
 
712
 
713
 
714
 
715
 
716
 
717
 
718
 
719
 
720
 
721
 
722
 
723
 
724
 
725
 
726
 
727
 
728
 
729
 
730
 
731
 
732
 
733
 
734
 
735
 
736
 
737
 
738
 
739
 
740
 
741
 
742
 
743
 
744
 
745
 
746
 
747
 
748
 
749
 
750
 
751
 
752
 
753
 
754
 
755
 
756
 
757
 
758
 
759
 
760
 
761
 
762
 
763
 
764
 
765
 
766
 
767
 
768
 
769
 
770
 
771
 
772
 
773
 
774
 
775
 
776
 
777
 
778
 
779
 
780
 
781
 
782
 
783
 
784
 
785
 
786
 
787
 
788
 
789
 
790
 
791
 
792
 
793
 
From 96ef36fd2c2544b7cc5b6c942247f52a4d450f99 Mon Sep 17 00:00:00 2001
From: sisu 
Date: Sun, 1 Dec 2024 18:18:49 +0200
Subject: [PATCH] Add hack4 (ai1337)

---
 docs/specs/hxp_ai1337.rst            | 192 ++++++++++++++++++++++++++
 target/i386/cpu.c                    |  52 +++++++
 target/i386/cpu.h                    |  17 +++
 target/i386/ops_ai1337.h             |   8 ++
 target/i386/tcg/decode-new.c.inc     |  17 +++
 target/i386/tcg/emit.c.inc           |  47 +++++++
 target/i386/tcg/sysemu/excp_helper.c |  12 ++
 target/i386/tcg/sysemu/misc_helper.c |  40 ++++++
 target/i386/tcg/translate.c          | 196 +++++++++++++++++++++++++++
 9 files changed, 581 insertions(+)
 create mode 100644 docs/specs/hxp_ai1337.rst
 create mode 100644 target/i386/ops_ai1337.h

diff --git a/docs/specs/hxp_ai1337.rst b/docs/specs/hxp_ai1337.rst
new file mode 100644
index 000000000..95b6d4280
--- /dev/null
+++ b/docs/specs/hxp_ai1337.rst
@@ -0,0 +1,192 @@
+HXP HACK-4 AI1337 Device Specification
+======================================
+
+The HXP HACK-4 AI1337 is designed to fulfil the compute needs
+of the AI industry. The design is a significant extension to
+the existing X86 architecture to enable fast scratch operations.
+
+High-Level Architecture
+=======================
+
+This section provides a high-level overview of the HXP HACK-4 and
+AI1337 architecture.
+
+Processor Organization
+----------------------
+
+::
+
+  HXP HACK-4 application processor, optimized for scalar compute
+  AI1337 Engine, optimized for very-wide compute
+
+              |------------|---------|-----------|
+              |            |---------|           |
+              | HXP HACK-4 |---------| AI1337 IP |
+              |            |---------|           |
+              |------------|---------|-----------|
+                                |
+                                |
+                      PSCHORR interconnect
+
+  PSCHORR very-wide link
+
+The HXP HACK-4 is the application processor responsible for booting and
+executing OS software. The AI1337 execution engine is an on-die engine
+responsible for fast scratch operations.
+
+AI1337 Engine Organization
+--------------------------
+
+::
+
+  --------------------------------------------
+  |               AI1337 engine              |  Execution Interconnect
+  |                                          |  |
+  |------------------------------------------|  |
+  | Slice 0                                  |-----|   --------------------
+  |------------------------------------------|     |---| Multi-ALU engine |
+  | Slice 1                                  |-----|   --------------------
+  |------------------------------------------|     |
+  | Slice 2                                  |-----|   --------------------
+  |------------------------------------------|     |---| Multi-ALU engine |
+  | ...                                      |-----|   --------------------
+  |------------------------------------------|     |
+  | Slice N                                  |-----|   --------------------
+  |------------------------------------------|     |---| Multi-ALU engine |
+                                                       --------------------
+
+The AI1337 engine is organized as a vector of interconnected memory slices.
+Slices are interconnected via the 'execution interconnect' in an N-to-N
+fashion, and each cross-slice wide-link is connected to a series of
+multi-ALU engines that support fast addition, subtraction and multiplication.
+
+PSCHORR Interconnect
+--------------------
+
+The PSCHORR Interconnect connects the HACK-4 application processor
+and the AI1337 Engine using a multi-link organization for fast
+slice reads and writes.
+
+The interconnect also allows the scratch memory to be addressed
+through a bi-ATS unit that supports bi-directional addressing of scratch
+and application processor memory.
+
+::
+
+      Physical Memory                Virtual Memory
+            0                               |
+            |                               |
+  IO space  |                               |
+            |                               |
+            -                               |
+            |                               |
+            |                               |
+            |                               |   Direct Addressing
+    RAM     |                               |           |
+            |    ___________________________|_____      |
+            |   /                       |        |      |
+            ---/                        | bi-ATS |------|
+            |                           |        |
+            |    _______________________|________|
+  AI1337    |   /
+ aperture   |  /  PSCHORR Interconnect
+            ---
+
+ISA Contributions
+=================
+
+This section describes the ISA contributions to the X86_64 ISA.
+The added instructions are responsible for updating scratch memory
+on the AI1337 engine and for submitting work to the AI1337 engine.
+The ISA also includes instructions for fast reconfiguration of the
+PSCHORR interconnect.
+
+
+.. list-table:: ISA
+   :widths: 25 25 50
+   :header-rows: 1
+
+   * - Opcode
+     - Instruction
+     - Description
+   * - 0F 0A 83
+     - MTS
+     - Load RCX bytes from memory address (RSI) to slice (RBX) at slice offset (RDI)
+   * - 0F 0A 84
+     - STM
+     - Read RCX bytes from slice (RBX) at slice offset (RDI) and write them to memory address (RSI)
+   * - 0F 0A 85
+     - FSCR
+     - Clear all slices
+   * - 0F 0A 86
+     - SCRADD
+     - Add the slices indexed by RDI and RSI, and store the result into the slice indexed by RDX
+   * - 0F 0A 87
+     - SCRSUB
+     - Subtract the slice indexed by RSI from the slice indexed by RDI, and store the result into the slice indexed by RDX
+   * - 0F 0A 88
+     - SCRMUL
+     - Multiply the slices indexed by RDI and RSI, and store the result into the slice indexed by RDX
+   * - 0F 0A 89
+     - SCRHLW (privileged)
+     - Set the scratch memory PSCHORR bi-ATS base VA from RDI
+   * - 0F 0A 8A
+     - SCRHLR
+     - Read the scratch memory PSCHORR bi-ATS base VA into RAX
+
+System-Level Contributions
+==========================
+
+This section provides information on system-level specification and configuration,
+and is primarily targeted at kernel developers.
+
+Specification
+-------------
+
+Support for the AI1337 engine is indicated by the presence of the 0x80000022 CPUID leaf.
+If the AI1337 CPUID leaf exists, the EAX, ECX, EDX and EBX registers provide the following information:
+
+.. list-table:: CPUID 0x80000022
+   :widths: 25 25 50
+   :header-rows: 1
+
+   * - Register
+     - Bits
+     - Information
+   * - EAX
+     - 0-31
+     - Total scratch memory size
+   * - ECX
+     - 0-9
+     - Maximum number of slices
+   * - ECX
+     - 10-31
+     - Maximum slice size in bytes
+   * - EDX
+     - 0-31
+     - Low 32 bits of the AI1337 Aperture
+   * - EBX
+     - 0-31
+     - High 32 bits of the AI1337 Aperture
+
+Configuration
+-------------
+
+The AI1337 engine is a multi-configurable engine that software can
+utilize for scaling up for high-computing workloads and scaling
+down for power-efficiency.
+
+.. list-table:: MSR
+   :widths: 40 25 50
+   :header-rows: 1
+
+   * - MSR
+     - Identifier
+     - Description
+   * - MSR_HACK4_SLICE_SIZE
+     - 0xC0000105
+     - Read/Write slice size in the AI1337 engine
+   * - MSR_HACK4_NUM_SLICES
+     - 0xC0000106
+     - Read/Write count of slices in the AI1337 engine
+
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 85ef7452c..197a813f7 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -43,9 +43,13 @@
 #include "hw/i386/sgx-epc.h"
 #endif
 
+#include "exec/ramblock.h"
+
 #include "disas/capstone.h"
 #include "cpu-internal.h"
 
+#include "ops_ai1337.h"
+
 static void x86_cpu_realizefn(DeviceState *dev, Error **errp);
 
 /* Helpers for building CPUID[2] descriptors: */
@@ -5256,6 +5260,26 @@ static const X86CPUDefinition builtin_x86_defs[] = {
         .model_id = "AMD EPYC-Genoa Processor",
         .cache_info = &epyc_genoa_cache_info,
     },
+    {
+        .name = "hxp-ai1337",
+        .level = 0xd,
+        .vendor = CPUID_VENDOR_AMD,
+        .family = 25,
+        .model = 1,
+        .stepping = 1,
+        .features[FEAT_1_EDX] =
+            PPRO_FEATURES |
+            CPUID_MTRR | CPUID_CLFLUSH | CPUID_MCA |
+            CPUID_PSE36,
+        .features[FEAT_1_ECX] =
+            CPUID_EXT_SSE3 | CPUID_EXT_CX16 | CPUID_EXT_RDRAND,
+        .features[FEAT_8000_0001_EDX] =
+            CPUID_EXT2_LM | CPUID_EXT2_SYSCALL | CPUID_EXT2_NX,
+        .features[FEAT_8000_0001_ECX] =
+            CPUID_EXT3_LAHF_LM | CPUID_EXT3_SVM,
+        .xlevel = 0x80000022,
+        .model_id = "HXP Silicon Foundaries AI 1337 Processor",
+    },
 };
 
 /*
@@ -5688,6 +5712,11 @@ static inline void feat2prop(char *s)
     }
 }
 
+uint64_t x86_calculate_scratch_size(CPUX86State* env)
+{   /* Active scratch size: configured slice size multiplied by the active slice count. */
+    return (env->scratch_config.slice_size * env->scratch_config.num_active_slices);
+}
+
 /* Return the feature property name for a feature flag bit */
 static const char *x86_cpu_feature_name(FeatureWord w, int bitnr)
 {
@@ -7044,6 +7073,13 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count,
         *eax = env->features[FEAT_8000_0021_EAX];
         *ebx = *ecx = *edx = 0;
         break;
+    case 0x80000022:
+        *eax = *ebx = *ecx = *edx = 0;
+        *ecx = (AI1337_SCRATCH_MAX_SLICE_SIZE << 10) | AI1337_SCRATCH_MAX_NUM_SLICES;
+        *eax = AI1337_SCRATCH_SIZE;
+        *edx = (AI1337_SCRATCH_PHYS_BASE & 0xFFFFFFFFU);
+        *ebx = ((AI1337_SCRATCH_PHYS_BASE >> 32U) & 0xFFFFFFFFU);
+        break;
     default:
         /* reserved values: zero */
         *eax = 0;
@@ -8052,6 +8088,22 @@ static void x86_cpu_initfn(Object *obj)
     if (xcc->model) {
         x86_cpu_load_model(cpu, xcc->model);
     }
+
+    {
+        /* Program the initial scratch geometry and the bi-ATS window; access starts disabled. */
+        env->scratch_config.num_active_slices = AI1337_SCRATCH_NUM_SLICES_DEFAULT;
+        env->scratch_config.slice_size = AI1337_SCRATCH_SLICE_SIZE_DEFAULT;
+        env->scratch_config.va_base = AI1337_SCRATCH_VA_BASE;
+        env->scratch_config.phys_base = AI1337_SCRATCH_PHYS_BASE;
+        env->scratch_config.access_enabled = 0;
+        /* The backing store must outlive this function and cover max_length, since the region can be resized. */
+        void *scratch = g_malloc0(AI1337_SCRATCH_MAX_NUM_SLICES * AI1337_SCRATCH_MAX_SLICE_SIZE);
+        env->scratch_region = g_new0(MemoryRegion, 1);
+        memory_region_init_ram_ptr(env->scratch_region, NULL, "ai1337-scratch", AI1337_SCRATCH_SIZE, scratch);
+        env->scratch_region->ram_block->flags |= RAM_RESIZEABLE;
+        env->scratch_region->ram_block->max_length = AI1337_SCRATCH_MAX_NUM_SLICES * AI1337_SCRATCH_MAX_SLICE_SIZE;
+        memory_region_add_subregion(get_system_memory(), AI1337_SCRATCH_PHYS_BASE, env->scratch_region);
+    }
 }
 
 static int64_t x86_cpu_get_arch_id(CPUState *cs)
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 14edd57a3..778c9a730 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -544,6 +544,9 @@ typedef enum X86Seg {
 #define MSR_IA32_XFD                    0x000001c4
 #define MSR_IA32_XFD_ERR                0x000001c5
 
+#define MSR_HACK4_SLICE_SIZE            0xc0000105
+#define MSR_HACK4_NUM_SLICES            0xc0000106
+
 /* FRED MSRs */
 #define MSR_IA32_FRED_RSP0              0x000001cc       /* Stack level 0 regular stack pointer */
 #define MSR_IA32_FRED_RSP1              0x000001cd       /* Stack level 1 regular stack pointer */
@@ -1681,6 +1684,14 @@ typedef struct HVFX86LazyFlags {
     target_ulong auxbits;
 } HVFX86LazyFlags;
 
+typedef struct ScratchConfig {
+    uint64_t va_base;           /* guest virtual base of the scratch window (written by SCRHLW) */
+    uint64_t phys_base;         /* base the window is redirected to in x86_cpu_tlb_fill */
+    size_t num_active_slices;   /* current slice count (MSR_HACK4_NUM_SLICES) */
+    size_t slice_size;          /* current slice size (MSR_HACK4_SLICE_SIZE) */
+    int access_enabled;         /* set to 1 around AI1337 scratch accesses; gates the tlb_fill redirect */
+} ScratchConfig;
+
 typedef struct CPUArchState {
     /* standard registers */
     target_ulong regs[CPU_NB_REGS];
@@ -1996,6 +2007,10 @@ typedef struct CPUArchState {
 
     /* Bitmap of available CPU topology levels for this CPU. */
     DECLARE_BITMAP(avail_cpu_topo, CPU_TOPO_LEVEL_MAX);
+
+    MemoryRegion *scratch_region;
+    ScratchConfig scratch_config;
+
 } CPUX86State;
 
 struct kvm_msrs;
@@ -2639,6 +2654,8 @@ void x86_cpu_xsave_all_areas(X86CPU *cpu, void *buf, uint32_t buflen);
 uint32_t xsave_area_size(uint64_t mask, bool compacted);
 void x86_update_hflags(CPUX86State* env);
 
+uint64_t x86_calculate_scratch_size(CPUX86State* env);
+
 static inline bool hyperv_feat_enabled(X86CPU *cpu, int feat)
 {
     return !!(cpu->hyperv_features & BIT(feat));
diff --git a/target/i386/ops_ai1337.h b/target/i386/ops_ai1337.h
new file mode 100644
index 000000000..7aea6ae78
--- /dev/null
+++ b/target/i386/ops_ai1337.h
@@ -0,0 +1,8 @@
+
+#define AI1337_SCRATCH_VA_BASE 0xFFFFFFFFFFA00000ULL /* initial value of scratch_config.va_base */
+#define AI1337_SCRATCH_PHYS_BASE 0xFFFFFFFFFFF00000ULL /* aperture base, reported in CPUID 0x80000022 EDX:EBX */
+#define AI1337_SCRATCH_SIZE (33ULL * 1024) /* initial region size; equals default slice count * default slice size */
+#define AI1337_SCRATCH_MAX_NUM_SLICES (128) /* upper bound accepted for the slice count */
+#define AI1337_SCRATCH_SLICE_SIZE_DEFAULT (1024ULL) /* slice size programmed at CPU init */
+#define AI1337_SCRATCH_NUM_SLICES_DEFAULT (33UL) /* slice count programmed at CPU init */
+#define AI1337_SCRATCH_MAX_SLICE_SIZE (4096ULL) /* upper bound accepted for the slice size */
diff --git a/target/i386/tcg/decode-new.c.inc b/target/i386/tcg/decode-new.c.inc
index 30be9237c..968042464 100644
--- a/target/i386/tcg/decode-new.c.inc
+++ b/target/i386/tcg/decode-new.c.inc
@@ -1032,6 +1032,21 @@ static void decode_0F5A(DisasContext *s, CPUX86State *env, X86OpEntry *entry, ui
     *entry = *decode_by_prefix(s, opcodes_0F5A);
 }
 
+static void decode_0F0A(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    uint8_t c = x86_ldub_code(env, s);          /* next code byte selects the AI1337 instruction */
+    switch (c) {
+    case 0x83: entry->gen = gen_MTS; break;     /* memory -> slice copy */
+    case 0x84: entry->gen = gen_STM; break;     /* slice -> memory copy */
+    case 0x85: entry->gen = gen_FSCR; break;    /* clear all slices */
+    case 0x86: entry->gen = gen_SCRADD; break;  /* slice add */
+    case 0x87: entry->gen = gen_SCRSUB; break;  /* slice subtract */
+    case 0x88: entry->gen = gen_SCRMUL; break;  /* slice multiply */
+    case 0x89: entry->gen = gen_SCRHLW; break;  /* write bi-ATS base VA */
+    case 0x8a: entry->gen = gen_SCRHLR; break;  /* read bi-ATS base VA */
+    }                                           /* any other byte leaves entry->gen untouched */
+}
+
 static void decode_0F5B(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
 {
     static const X86OpEntry opcodes_0F5B[4] = {
@@ -1273,6 +1288,8 @@ static const X86OpEntry opcodes_0F[256] = {
     [0x7e] = X86_OP_GROUP0(0F7E),
     [0x7f] = X86_OP_GROUP0(0F7F),
 
+    [0x0a] = X86_OP_GROUP0(0F0A),
+
     [0x88] = X86_OP_ENTRYr(Jcc, J,z_f64),
     [0x89] = X86_OP_ENTRYr(Jcc, J,z_f64),
     [0x8a] = X86_OP_ENTRYr(Jcc, J,z_f64),
diff --git a/target/i386/tcg/emit.c.inc b/target/i386/tcg/emit.c.inc
index 9b5041991..9a2e57b8f 100644
--- a/target/i386/tcg/emit.c.inc
+++ b/target/i386/tcg/emit.c.inc
@@ -3853,6 +3853,53 @@ static void gen_SUB(DisasContext *s, X86DecodedInsn *decode)
     prepare_update2_cc(decode, s, CC_OP_SUBB + ot);
 }
 
+static void gen_MTS(DisasContext *s, X86DecodedInsn *decode)
+{
+    gen_repz(s, MO_8, gen_mts_8); /* MTS: hands the byte-sized step gen_mts_8 to gen_repz */
+}
+
+static void gen_FSCR(DisasContext *s, X86DecodedInsn *decode)
+{
+    gen_fscr(s); /* FSCR: emit the scratch zero-fill loop; the decoded operands are unused */
+}
+
+static void gen_SCRHLW(DisasContext *s, X86DecodedInsn *decode)
+{
+    /* SCRHLW is privileged: outside CPL 0 it is treated as an illegal opcode. */
+    if (CPL(s) != 0) {
+        gen_illegal_opcode(s);
+        return;
+    }
+    /* scratch_config.va_base <- RDI */
+    tcg_gen_st_tl(cpu_regs[R_EDI], tcg_env, offsetof(CPUX86State, scratch_config.va_base));
+}
+
+static void gen_SCRHLR(DisasContext *s, X86DecodedInsn *decode)
+{
+    /* SCRHLR: RAX <- scratch_config.va_base; no privilege check on the read side. */
+    tcg_gen_ld_tl(cpu_regs[R_EAX], tcg_env, offsetof(CPUX86State, scratch_config.va_base));
+}
+
+static void gen_STM(DisasContext *s, X86DecodedInsn *decode)
+{
+    gen_repz(s, MO_8, gen_stm_8); /* STM: hands the byte-sized step gen_stm_8 to gen_repz */
+}
+
+static void gen_SCRADD(DisasContext *s, X86DecodedInsn *decode)
+{
+    gen_slice_op(s, SLICE_OP_TYPE_ADD); /* SCRADD: slice[RDX] = slice[RDI] + slice[RSI] */
+}
+
+static void gen_SCRSUB(DisasContext *s, X86DecodedInsn *decode)
+{
+    gen_slice_op(s, SLICE_OP_TYPE_SUB); /* SCRSUB: slice[RDX] = slice[RDI] - slice[RSI] */
+}
+
+static void gen_SCRMUL(DisasContext *s, X86DecodedInsn *decode)
+{
+    gen_slice_op(s, SLICE_OP_TYPE_MUL); /* SCRMUL: slice[RDX] = slice[RDI] * slice[RSI] */
+}
+
 static void gen_SYSCALL(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_update_cc_op(s);
diff --git a/target/i386/tcg/sysemu/excp_helper.c b/target/i386/tcg/sysemu/excp_helper.c
index 8fb05b1f5..f524f97c2 100644
--- a/target/i386/tcg/sysemu/excp_helper.c
+++ b/target/i386/tcg/sysemu/excp_helper.c
@@ -23,6 +23,7 @@
 #include "exec/exec-all.h"
 #include "exec/page-protection.h"
 #include "tcg/helper-tcg.h"
+#include "../../ops_ai1337.h"
 
 typedef struct TranslateParams {
     target_ulong addr;
@@ -600,6 +601,17 @@ bool x86_cpu_tlb_fill(CPUState *cs, vaddr addr, int size,
     TranslateResult out;
     TranslateFault err;
 
+    if (env->scratch_config.access_enabled &&
+        (addr >= env->scratch_config.va_base) &&
+        ((addr + size) <= (env->scratch_config.va_base + x86_calculate_scratch_size(env)))) {
+        vaddr paddr = env->scratch_config.phys_base + (addr - env->scratch_config.va_base);
+        tlb_set_page_with_attrs(cs, addr & TARGET_PAGE_MASK,
+                                paddr & TARGET_PAGE_MASK,
+                                cpu_get_mem_attrs(env),
+                                PAGE_READ | PAGE_WRITE | PAGE_EXEC, mmu_idx, TARGET_PAGE_SIZE);
+        return true;
+    }
+
     if (get_physical_address(env, addr, access_type, mmu_idx, &out, &err,
                              retaddr)) {
         /*
diff --git a/target/i386/tcg/sysemu/misc_helper.c b/target/i386/tcg/sysemu/misc_helper.c
index 094aa56a2..78fd3a573 100644
--- a/target/i386/tcg/sysemu/misc_helper.c
+++ b/target/i386/tcg/sysemu/misc_helper.c
@@ -26,6 +26,8 @@
 #include "exec/exec-all.h"
 #include "tcg/helper-tcg.h"
 #include "hw/i386/apic.h"
+#include "qapi/error.h"
+#include "../../ops_ai1337.h"
 
 void helper_outb(CPUX86State *env, uint32_t port, uint32_t data)
 {
@@ -128,6 +129,27 @@ void helper_write_crN(CPUX86State *env, int reg, target_ulong t0)
     }
 }
 
+static bool helper_recalculate_scratch(CPUX86State *env, uint32_t new_num_slices, uint32_t new_slice_size)
+{
+    /* Resize scratch RAM to new_num_slices * new_slice_size; returns false if the request is rejected. */
+    if (new_num_slices > AI1337_SCRATCH_MAX_NUM_SLICES ||
+        new_slice_size > AI1337_SCRATCH_MAX_SLICE_SIZE ||
+        (new_slice_size % 8) != 0) { /* SCRADD/SCRSUB/SCRMUL walk a slice 8 bytes at a time */
+        return false;
+    }
+    Error *err = NULL;
+    bql_lock();
+    memory_region_ram_resize(env->scratch_region, new_num_slices * new_slice_size, &err);
+    bql_unlock();
+    if (err) {
+        error_free(err); /* we own the Error on failure; do not leak it */
+        return false;
+    }
+    env->scratch_config.num_active_slices = new_num_slices;
+    env->scratch_config.slice_size = new_slice_size;
+    return true;
+}
+
 void helper_wrmsr(CPUX86State *env)
 {
     uint64_t val;
@@ -306,6 +328,18 @@ void helper_wrmsr(CPUX86State *env)
 
         break;
     }
+    case MSR_HACK4_SLICE_SIZE:
+        const uint32_t new_slice_size = val;
+        if (!helper_recalculate_scratch(env, env->scratch_config.num_active_slices, new_slice_size)) {
+            goto error;
+        }
+        break;
+    case MSR_HACK4_NUM_SLICES:
+        const uint32_t new_num_active_slices = val;
+        if (!helper_recalculate_scratch(env, new_num_active_slices, env->scratch_config.slice_size)) {
+            goto error;
+        }
+        break;
     default:
         if ((uint32_t)env->regs[R_ECX] >= MSR_MC0_CTL
             && (uint32_t)env->regs[R_ECX] < MSR_MC0_CTL +
@@ -333,6 +367,12 @@ void helper_rdmsr(CPUX86State *env)
     cpu_svm_check_intercept_param(env, SVM_EXIT_MSR, 0, GETPC());
 
     switch ((uint32_t)env->regs[R_ECX]) {
+    case MSR_HACK4_SLICE_SIZE:
+        val = env->scratch_config.slice_size;
+        break;
+    case MSR_HACK4_NUM_SLICES:
+        val = env->scratch_config.num_active_slices;
+        break;
     case MSR_IA32_SYSENTER_CS:
         val = env->sysenter_cs;
         break;
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index 98f5fe61e..0fd28c60f 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -21,6 +21,7 @@
 #include "qemu/host-utils.h"
 #include "cpu.h"
 #include "exec/exec-all.h"
+#include "tcg/tcg-op-common.h"
 #include "tcg/tcg-op.h"
 #include "tcg/tcg-op-gvec.h"
 #include "exec/translator.h"
@@ -32,6 +33,8 @@
 
 #include "exec/log.h"
 
+#include "ops_ai1337.h"
+
 #define HELPER_H "helper.h"
 #include "exec/helper-info.c.inc"
 #undef  HELPER_H
@@ -1198,6 +1201,199 @@ static void gen_stos(DisasContext *s, MemOp ot)
     gen_op_add_reg(s, s->aflag, R_EDI, gen_compute_Dshift(s, ot));
 }
 
+static void gen_fscr(DisasContext *s)
+{
+    /* FSCR: store zero bytes over slice_size * num_active_slices bytes of scratch, last byte first. */
+    TCGLabel *l1 = gen_new_label();
+    TCGLabel *l2 = gen_new_label();
+    const size_t slice_size_offset = offsetof(CPUX86State, scratch_config.slice_size);
+    const size_t slice_count_offset = offsetof(CPUX86State, scratch_config.num_active_slices);
+    const size_t va_base_offset = offsetof(CPUX86State, scratch_config.va_base);
+    const size_t access_offset = offsetof(CPUX86State, scratch_config.access_enabled);
+    /* access_enabled is an int: use a 32-bit store so exactly that field is written. */
+    tcg_gen_st_i32(tcg_constant_i32(1), tcg_env, access_offset);
+
+    // Calculate size
+    tcg_gen_ld32u_tl(s->tmp0, tcg_env, slice_size_offset);
+    tcg_gen_ld32u_tl(s->tmp4, tcg_env, slice_count_offset);
+    tcg_gen_mul_tl(s->tmp0, s->tmp0, s->tmp4);
+
+    // Loop to clear memory: tmp0 counts down from the total size to zero
+    gen_set_label(l1);
+    gen_update_cc_op(s);
+    TCGv tmp = gen_ext_tl(NULL, s->tmp0, s->aflag, false);
+    tcg_gen_brcondi_tl(TCG_COND_EQ, tmp, 0, l2);
+    tcg_gen_subi_tl(s->tmp0, s->tmp0, 1);
+    tcg_gen_ld_tl(s->A0, tcg_env, va_base_offset);
+    gen_lea_v_seg(s, s->A0, R_ES, -1);
+    tcg_gen_add_tl(s->A0, s->A0, s->tmp0);
+    gen_op_st_v(s, MO_8, tcg_constant_tl(0), s->A0);
+    tmp = gen_ext_tl(NULL, s->tmp0, s->aflag, false);
+    tcg_gen_brcondi_tl(TCG_COND_NE, tmp, 0, l1);
+    gen_set_label(l2);
+
+    tcg_gen_st_i32(tcg_constant_i32(0), tcg_env, access_offset);
+}
+
+typedef enum SLICE_OP_TYPE {
+    SLICE_OP_TYPE_ADD,  /* used by gen_SCRADD */
+    SLICE_OP_TYPE_SUB,  /* used by gen_SCRSUB */
+    SLICE_OP_TYPE_MUL,  /* used by gen_SCRMUL */
+} SLICE_OP_TYPE;
+
+static void gen_illegal_opcode(DisasContext *s);
+
+static void gen_slice_op(DisasContext *s, SLICE_OP_TYPE op_type)
+{
+    /* Emit slice[RDX] = slice[RDI] <op_type> slice[RSI], 64 bits at a time from the slice end down. */
+    TCGLabel *l1 = gen_new_label();
+    TCGLabel *l2 = gen_new_label();
+    const size_t slice_size_offset = offsetof(CPUX86State, scratch_config.slice_size);
+    const size_t va_base_offset = offsetof(CPUX86State, scratch_config.va_base);
+    const size_t access_offset = offsetof(CPUX86State, scratch_config.access_enabled);
+
+    const TCGv slice_a = cpu_regs[R_EDI];
+    const TCGv slice_b = cpu_regs[R_ESI];
+    const TCGv slice_c = cpu_regs[R_EDX];
+
+    tcg_gen_st_i32(tcg_constant_i32(1), tcg_env, access_offset); /* int field: 32-bit store */
+
+    // slice size
+    tcg_gen_ld32u_tl(s->tmp0, tcg_env, slice_size_offset);
+
+    // tmp4 always holds the const slice size
+    tcg_gen_mov_tl(s->tmp4, s->tmp0);
+
+    // Loop over the slice: tmp0 is the byte offset of the current 64-bit lane
+    gen_set_label(l1);
+    gen_update_cc_op(s);
+    TCGv tmp = gen_ext_tl(NULL, s->tmp0, s->aflag, false);
+    tcg_gen_brcondi_tl(TCG_COND_EQ, tmp, 0, l2);
+
+    // slice_size -= 8
+    tcg_gen_subi_tl(s->tmp0, s->tmp0, 8);
+
+    // load slice_a value into T1
+    // A0, T1 initialized
+    tcg_gen_ld_tl(s->A0, tcg_env, va_base_offset);
+    gen_lea_v_seg(s, s->A0, R_ES, -1);
+    tcg_gen_mul_tl(s->T1, slice_a, s->tmp4);
+    tcg_gen_add_tl(s->A0, s->A0, s->T1);
+    tcg_gen_add_tl(s->A0, s->A0, s->tmp0);
+    gen_op_ld_v(s, MO_64, s->T1, s->A0);
+
+    // load slice_b value into T0
+    // A0, T0 initialized
+    tcg_gen_ld_tl(s->A0, tcg_env, va_base_offset);
+    gen_lea_v_seg(s, s->A0, R_ES, -1);
+    tcg_gen_mul_tl(s->T0, slice_b, s->tmp4);
+    tcg_gen_add_tl(s->A0, s->A0, s->T0);
+    tcg_gen_add_tl(s->A0, s->A0, s->tmp0);
+    gen_op_ld_v(s, MO_64, s->T0, s->A0);
+
+    // T0 holds the result of the operation
+    switch (op_type)
+    {
+    case SLICE_OP_TYPE_ADD:
+        tcg_gen_add_tl(s->T0, s->T1, s->T0);
+        break;
+    case SLICE_OP_TYPE_SUB:
+        tcg_gen_sub_tl(s->T0, s->T1, s->T0);
+        break;
+    case SLICE_OP_TYPE_MUL:
+        tcg_gen_mul_tl(s->T0, s->T1, s->T0);
+        break;
+    default:
+        gen_illegal_opcode(s);
+        return;
+    }
+
+    // Calculate address for slice_c slot
+    tcg_gen_ld_tl(s->A0, tcg_env, va_base_offset);
+    gen_lea_v_seg(s, s->A0, R_ES, -1);
+    tcg_gen_mul_tl(s->T1, slice_c, s->tmp4);
+    tcg_gen_add_tl(s->A0, s->A0, s->T1);
+    tcg_gen_add_tl(s->A0, s->A0, s->tmp0);
+    gen_op_st_v(s, MO_64, s->T0, s->A0);
+
+    tmp = gen_ext_tl(NULL, s->tmp0, s->aflag, false);
+    tcg_gen_brcondi_tl(TCG_COND_NE, tmp, 0, l1);
+    gen_set_label(l2);
+
+    tcg_gen_st_i32(tcg_constant_i32(0), tcg_env, access_offset);
+}
+
+static void gen_mts_8(DisasContext *s, MemOp ot)
+{
+    /* One MTS step: copy the byte at DS:[RSI] into slice RBX at offset RDI, then step RSI and RDI. */
+    const size_t slice_size_offset = offsetof(CPUX86State, scratch_config.slice_size);
+    const size_t va_base_offset = offsetof(CPUX86State, scratch_config.va_base);
+    const size_t access_offset = offsetof(CPUX86State, scratch_config.access_enabled);
+    const TCGv slice_index = cpu_regs[R_EBX];
+    const TCGv offset_in_slice = cpu_regs[R_EDI];
+    const TCGv memory_address = cpu_regs[R_ESI];
+    const TCGv dshift = gen_compute_Dshift(s, ot);
+
+    tcg_gen_st_i32(tcg_constant_i32(1), tcg_env, access_offset); /* int field: 32-bit store */
+
+    // load from memory address
+    gen_lea_v_seg(s, memory_address, R_DS, -1);
+    gen_op_ld_v(s, MO_8, s->T0, s->A0);
+
+    // Calculate address for scratch
+    // A0 = offset_in_slice + slice_base + (slice_index * slice_size)
+    tcg_gen_ld_tl(s->A0, tcg_env, va_base_offset);
+    gen_lea_v_seg(s, s->A0, R_ES, -1);
+    tcg_gen_add_tl(s->A0, s->A0, offset_in_slice);
+    tcg_gen_ld32u_tl(s->tmp0, tcg_env, slice_size_offset);
+    tcg_gen_mul_tl(s->tmp0, s->tmp0, slice_index);
+    tcg_gen_add_tl(s->A0, s->A0, s->tmp0);
+
+    // Store value
+    gen_op_st_v(s, MO_8, s->T0, s->A0);
+
+    gen_op_add_reg(s, s->aflag, R_ESI, dshift);
+    gen_op_add_reg(s, s->aflag, R_EDI, dshift);
+
+    tcg_gen_st_i32(tcg_constant_i32(0), tcg_env, access_offset);
+}
+
+static void gen_stm_8(DisasContext *s, MemOp ot)
+{
+    /* One STM step: copy the byte in slice RBX at offset RDI out to DS:[RSI], then step RSI and RDI. */
+    const size_t va_base_offset = offsetof(CPUX86State, scratch_config.va_base);
+    const size_t slice_size_offset = offsetof(CPUX86State, scratch_config.slice_size);
+    const size_t access_offset = offsetof(CPUX86State, scratch_config.access_enabled);
+    const TCGv slice_index = cpu_regs[R_EBX];
+    const TCGv offset_in_slice = cpu_regs[R_EDI];
+    const TCGv memory_address = cpu_regs[R_ESI];
+    const TCGv dshift = gen_compute_Dshift(s, ot);
+
+    tcg_gen_st_i32(tcg_constant_i32(1), tcg_env, access_offset); /* int field: 32-bit store */
+
+    // Calculate address for scratch
+    // A0 = offset_in_slice + slice_base + (slice_index * slice_size)
+    tcg_gen_ld_tl(s->A0, tcg_env, va_base_offset);
+    gen_lea_v_seg(s, s->A0, R_ES, -1);
+    tcg_gen_add_tl(s->A0, s->A0, offset_in_slice);
+
+    tcg_gen_ld32u_tl(s->tmp0, tcg_env, slice_size_offset);
+    tcg_gen_mul_tl(s->tmp0, s->tmp0, slice_index);
+    tcg_gen_add_tl(s->A0, s->A0, s->tmp0);
+
+    // Load value from scratch
+    gen_op_ld_v(s, MO_8, s->T0, s->A0);
+
+    // Write to memory address
+    gen_lea_v_seg(s, memory_address, R_DS, -1);
+    gen_op_st_v(s, MO_8, s->T0, s->A0);
+
+    gen_op_add_reg(s, s->aflag, R_ESI, dshift);
+    gen_op_add_reg(s, s->aflag, R_EDI, dshift);
+
+    tcg_gen_st_i32(tcg_constant_i32(0), tcg_env, access_offset);
+}
+
 static void gen_lods(DisasContext *s, MemOp ot)
 {
     gen_string_movl_A0_ESI(s);
-- 
2.34.1