Mandelbrot Generator
Posté le 15/06/2019 10:38
Hello allô
Default 1x zoom takes 7sec
Max zoom takes around 5-10min
It has a max zoom of 2^50: over one Quadrillion!
Going over 2^48 can be rather buggy
This is because coordinates are stored in 8-byte double variables, which run out of precision at that depth
Attached file is both SH4 and SH3 compatible:
MANDEL.G1A
This needs the 'MonochromeLib' library; the required code is now included with the source
Controls
[-] Zoom out
[+] Zoom in
[F1] Hide/show the HUD (Heads-Up Display), which contains coordinates, zoom level and max iterations
[F2] Changes colours of camera rectangle: Black, White & Inverted
[AC] Resets screen back to default state
[EXE] Draw set
[EXIT] Stop drawing the Mandelbrot (If it's taking too long)
[MENU] Return to the menu screen
[REPLAY] Move camera rectangle around (Arrow Keys: [LEFT], [RIGHT], [UP], [DOWN])
How can I optimize this code to run faster or zoom in further?
#include "fxlib.h"
#include "stdio.h"
#define TRUE 1
#define FALSE 0
// ML_vram_adress() casts the word table sc0135 to a function pointer and calls it;
// every caller in this file treats the returned char* as the start of the 128x64
// monochrome frame buffer (16 bytes per row, 1024 bytes in total).
// NOTE(review): sc0135 looks like a system-call trampoline (last word 0x0135) - confirm against the SDK syscall list.
#define ML_vram_adress (*(sc_cpv)sc0135)
// Drawing modes accepted by the ML_* primitives below.
typedef enum { ML_TRANSPARENT = -1, ML_WHITE, ML_BLACK, ML_XOR, ML_CHECKER } ML_Color;
// Signature of the code stored in sc0135: no arguments, returns a byte pointer.
typedef char* (*sc_cpv)(void);
const unsigned int sc0135[] = { 0xD201D002, 0x422B0009, 0x80010070, 0x0135 };
unsigned int key; //last key code returned by GetKey()
int kcode1, kcode2; //row & col keycode for Bkey_GetKeyWait()
char unused; //dummy output argument required by Bkey_GetKeyWait()
// Pixel position currently being rendered. stop() overwrites both from the timer
// callback to abort a render, so they are volatile: the render loops must re-read
// them on every iteration instead of caching them in a register.
volatile unsigned short dispX, dispY; //cords on display when drawing mandelbrot
// Zeroes the whole 1024-byte frame buffer returned by ML_vram_adress().
// The buffer may start on any byte boundary, so it is cleared in three parts:
// 1..4 leading bytes up to a 4-byte boundary, 255 aligned 32-bit words (1020 bytes),
// then the 0..3 bytes that remain. The three parts always add up to 1024 bytes.
void ML_clear_vram() {
    int i, end, *pointer_long, vram;
    char *pointer_byte;

    vram = (int)ML_vram_adress();

    // Leading bytes. The parentheses matter: "4 - vram & 3" parses as
    // "(4 - vram) & 3", which yields 0 for an aligned buffer and left the last
    // four bytes of the screen uncleared.
    end = 4 - (vram & 3);
    pointer_byte = (char *)vram;
    for (i = 0; i < end; i++)
        pointer_byte[i] = 0;

    // Aligned middle part, one 32-bit word at a time.
    pointer_long = (int *)(vram + end);
    for (i = 0; i < 255; i++)
        pointer_long[i] = 0;

    // Trailing bytes after the word-aligned part.
    pointer_byte += 1020 + end;
    end = vram & 3;
    for (i = 0; i < end; i++)
        pointer_byte[i] = 0;
}
// Copies the complete frame buffer (64 rows of 16 bytes) to the LCD controller.
// For each row the code writes selector 4 with (row | 192), selector 4 with 0,
// then selector 7 followed by the 16 data bytes of that row.
// NOTE(review): the meaning of selectors 4 and 7 is taken from how they are used
// here (row/column setup, then data) - confirm against the LCD driver documentation.
void ML_display_vram() {
    // Both registers are memory-mapped I/O: volatile keeps the compiler from
    // merging, reordering or dropping the repeated stores to the same address.
    volatile char *LCD_register_selector = (volatile char *)0xB4000000;
    volatile char *LCD_data_register = (volatile char *)0xB4010000;
    char *vram;
    int i, j;

    vram = ML_vram_adress();
    for (i = 0; i < 64; i++) {
        *LCD_register_selector = 4;
        *LCD_data_register = i | 192;
        *LCD_register_selector = 4;
        *LCD_data_register = 0;
        *LCD_register_selector = 7;
        for (j = 0; j < 16; j++)
            *LCD_data_register = *vram++;
    }
}
// Copies one 16-byte row of the frame buffer to the LCD; faster than
// ML_display_vram(), which transfers all 64 rows.
// row: display row, 0..63. Any other value is ignored, because it would read
// outside the frame buffer and send an out-of-range row command to the LCD.
void ML_display_vram_row(int row) {
    // Memory-mapped I/O registers: volatile so every store is really emitted.
    volatile char *LCD_register_selector = (volatile char *)0xB4000000;
    volatile char *LCD_data_register = (volatile char *)0xB4010000;
    char *vram;
    int i;

    if (row & ~63)
        return;

    vram = (row << 4) + ML_vram_adress();
    *LCD_register_selector = 4;
    *LCD_data_register = row | 192;
    *LCD_register_selector = 4;
    *LCD_data_register = 0;
    *LCD_register_selector = 7;
    for (i = 0; i < 16; i++)
        *LCD_data_register = *vram++;
}
// Draws a horizontal line on row y between columns x1 and x2 (inclusive) into the frame buffer.
// y must be 0..63; x1/x2 may be given in either order and are clipped to 0..127.
// Nothing is drawn when y is off screen or both ends lie beyond the same screen edge.
// color selects the operation: ML_BLACK sets pixels, ML_WHITE clears them, ML_XOR inverts them,
// ML_CHECKER writes an alternating pattern; any other value leaves the buffer untouched.
// A pixel (x, y) is bit (128 >> (x & 7)) of byte vram[(y << 4) + (x >> 3)].
void ML_horizontal_line(int y, int x1, int x2, ML_Color color) {
int i;
char checker;
char* vram = ML_vram_adress();
//reject rows outside 0..63 and lines that are entirely left or right of the screen
if (y & ~63 || (x1 < 0 && x2 < 0) || (x1 > 127 && x2 > 127))
return;
//order the end points so that x1 <= x2
if (x1 > x2) {
i = x1;
x1 = x2;
x2 = i;
}
//clip to the visible columns
if (x1 < 0)
x1 = 0;
if (x2 > 127)
x2 = 127;
//Each case handles two situations: the line spans several bytes (partial first byte,
//partial last byte, whole bytes in between) or both ends fall inside the same byte.
//Note that + and - bind tighter than << and >>, so "255 << 7 - (x2 & 7)" means "255 << (7 - (x2 & 7))";
//bits shifted above bit 7 are dropped when the result is stored into a char.
switch (color) {
case ML_BLACK:
if (x1 >> 3 != x2 >> 3) {
vram[(y << 4) + (x1 >> 3)] |= 255 >> (x1 & 7); //first byte: bits from x1 to the end of the byte
vram[(y << 4) + (x2 >> 3)] |= 255 << 7 - (x2 & 7); //last byte: bits from the start of the byte to x2
for (i = (x1 >> 3) + 1; i < x2 >> 3; i++)
vram[(y << 4) + i] = 255; //full bytes in between
} else
vram[(y << 4) + (x1 >> 3)] |= (255 >> (x1 % 8 + 7 - x2 % 8)) << (7 - (x2 & 7)); //run of (x2 - x1 + 1) bits ending at x2
break;
case ML_WHITE:
if (x1 >> 3 != x2 >> 3) {
vram[(y << 4) + (x1 >> 3)] &= 255 << 8 - (x1 & 7); //keep only the bits left of x1
vram[(y << 4) + (x2 >> 3)] &= 255 >> 1 + (x2 & 7); //keep only the bits right of x2
for (i = (x1 >> 3) + 1; i < x2 >> 3; i++)
vram[(y << 4) + i] = 0;
} else
vram[(y << 4) + (x1 >> 3)] &= (255 << 8 - (x1 & 7)) | (255 >> 1 + (x2 & 7)); //keep the bits on both sides of the run
break;
case ML_XOR:
if (x1 >> 3 != x2 >> 3) {
vram[(y << 4) + (x1 >> 3)] ^= 255 >> (x1 & 7);
vram[(y << 4) + (x2 >> 3)] ^= 255 << 7 - (x2 & 7);
for (i = (x1 >> 3) + 1; i < (x2 >> 3); i++)
vram[(y << 4) + i] ^= 255;
} else
vram[(y << 4) + (x1 >> 3)] ^= (255 >> ((x1 & 7) + 7 - (x2 & 7))) << (7 - (x2 & 7));
break;
case ML_CHECKER:
checker = (y & 1 ? 85 : 170); //01010101 on odd rows, 10101010 on even rows
//clear the affected bits first (same masks as ML_WHITE), then OR in the pattern bits
if (x1 >> 3 != x2 >> 3) {
vram[(y << 4) + (x1 >> 3)] &= 255 << 8 - (x1 & 7);
vram[(y << 4) + (x2 >> 3)] &= 255 >> 1 + (x2 & 7);
vram[(y << 4) + (x1 >> 3)] |= checker & 255 >> (x1 & 7);
vram[(y << 4) + (x2 >> 3)] |= checker & 255 << 7 - (x2 & 7);
for (i = (x1 >> 3) + 1; i < x2 >> 3; i++)
vram[(y << 4) + i] = checker;
} else {
vram[(y << 4) + (x1 >> 3)] &= (255 << 8 - (x1 & 7)) | (255 >> 1 + (x2 & 7));
vram[(y << 4) + (x1 >> 3)] |= checker & (255 >> (x1 % 8 + 7 - x2 % 8)) << (7 - (x2 & 7));
}
break;
}
}
// Draws a vertical line in column x between rows y1 and y2 (inclusive) into the frame buffer.
// x must be 0..127; y1/y2 may be given in either order and are clipped to 0..63.
// Nothing is drawn when x is off screen or both ends lie beyond the same screen edge.
// color: ML_BLACK sets, ML_WHITE clears, ML_XOR inverts, ML_CHECKER alternates
// cleared/set pixels down the column; any other value leaves the buffer untouched.
void ML_vertical_line(int x, int y1, int y2, ML_Color color) {
    char *vram = ML_vram_adress();
    char mask, clear_next;
    int column_byte, offset, last_offset;

    if (x & ~127)
        return;
    if ((y1 < 0 && y2 < 0) || (y1 > 63 && y2 > 63))
        return;

    // Order the end points, then clip them to the screen.
    if (y2 < y1) {
        int swap = y2;
        y2 = y1;
        y1 = swap;
    }
    y1 = (y1 < 0) ? 0 : y1;
    y2 = (y2 > 63) ? 63 : y2;

    // Byte offsets of the first and last pixel; consecutive rows are 16 bytes apart.
    column_byte = x >> 3;
    offset = (y1 << 4) + column_byte;
    last_offset = (y2 << 4) + column_byte;

    if (color == ML_BLACK) {
        mask = 128 >> (x & 7);
        while (offset <= last_offset) {
            vram[offset] |= mask;
            offset += 16;
        }
    } else if (color == ML_WHITE) {
        mask = ~(128 >> (x & 7));
        while (offset <= last_offset) {
            vram[offset] &= mask;
            offset += 16;
        }
    } else if (color == ML_XOR) {
        mask = 128 >> (x & 7);
        while (offset <= last_offset) {
            vram[offset] ^= mask;
            offset += 16;
        }
    } else if (color == ML_CHECKER) {
        mask = 128 >> (x & 7);
        // The first pixel is cleared when x and y1 have different parity.
        clear_next = (y1 & 1) ^ (x & 1);
        while (offset <= last_offset) {
            if (clear_next)
                vram[offset] &= ~mask;
            else
                vram[offset] |= mask;
            clear_next = !clear_next;
            offset += 16;
        }
    }
}
// Applies color to the single pixel (x, y) of the frame buffer.
// Coordinates outside 0..127 / 0..63 are ignored.
// color: ML_BLACK sets the bit, ML_WHITE clears it, ML_XOR inverts it, ML_CHECKER
// clears it when x and y have different parity and sets it otherwise; any other
// value leaves the buffer untouched.
void ML_pixel(int x, int y, ML_Color color) {
    char *vram = ML_vram_adress();
    char *cell;
    int bit;

    if ((x & ~127) != 0 || (y & ~63) != 0)
        return;

    cell = &vram[(y << 4) + (x >> 3)]; // 16 bytes per row, 8 pixels per byte
    bit = 128 >> (x & 7);              // leftmost pixel is the most significant bit

    if (color == ML_BLACK) {
        *cell |= bit;
    } else if (color == ML_WHITE) {
        *cell &= ~bit;
    } else if (color == ML_XOR) {
        *cell ^= bit;
    } else if (color == ML_CHECKER) {
        if ((x ^ y) & 1)
            *cell &= ~bit;
        else
            *cell |= bit;
    }
}
// Returns n / x^p, computed by repeated division (p > 0) or repeated
// multiplication (p < 0); p == 0 returns n unchanged.
// Used instead of integer shifts so that powers beyond the 32-bit int range work.
double divByPow(double n, double x, int p) {
    // Negative exponent: multiply |p| times.
    while (p < 0) {
        n *= x;
        ++p;
    }
    // Positive exponent: divide p times (skipped when the loop above ran, p is 0 then).
    while (p > 0) {
        n /= x;
        --p;
    }
    return n;
}
// Timer callback installed by AddIn_main while a render is running.
// Polls the keyboard once; when the reported key has kcode1 == 4 and kcode2 == 8 or 9
// ([EXIT] / [MENU] according to the original comment) it pushes the render
// counters past their limits so that both drawing loops terminate.
void stop(void) {
    if (!Bkey_GetKeyWait(&kcode1, &kcode2, 1, 0, 1, &unused))
        return; // no key reported
    if (kcode1 != 4)
        return;
    if (kcode2 != 8 && kcode2 != 9)
        return;

    // Very hacky stop: pretend the last pixel of the last row has been reached.
    dispX = 128;
    dispY = 64;
}
// Add-in entry point: renders the Mandelbrot set, then lets the user position and
// size a selection rectangle; [EXE] (or [AC], which also resets the view) re-renders.
// The function loops forever; the trailing return is never reached.
//
// Render phase: for every pixel the iteration z = z^2 + c runs until |z|^2 > 4 or
// iMax iterations; pixels that never escape are drawn black. Eight pixels are packed
// into one VRAM byte and every finished row is pushed to the LCD. A timer calls
// stop() during the render so the user can abort.
//
// Selection phase: key presses adjust zoom/offset, the saved picture is restored,
// the rectangle and the optional HUD are drawn on top, and the new bounding box
// (x1, x2, y1, y2) is derived from graphZoom, graphX and graphY.
int AddIn_main(int isAppli, unsigned short OptionNum) { //Main function
    unsigned int graphZoom = 1; //zoom level for graph
    char screenZoom; //zoom level on screen (rectangle)
    int screenX1, screenX2; //corner X cords for drawing rectangle to screen
    int screenY1, screenY2; //corner Y cords for drawing rectangle to screen
    //Text buffer for the HUD. It used to be a single byte, so every sprintf below
    //wrote past its end; 32 bytes hold the longest line ("Zoom:2147483648x")
    //with room to spare.
    unsigned char string[32];
    char HUD = TRUE; //Heads Up Display: Cords, Zoom level & Max iteration: toggle with [F1]
    char colour = ML_XOR; //Colour of rectangle: Black, White or Inverted
    int screenX, screenY; //offset cords on screen from 0,0 for rectangle
    double graphX = 0, graphY = 0; //cords on graph - where to center mandelbrot
    double graphMove; //amount graphX & Y changes by when moving rectangle around
    int screenMove; //amount screenX & Y changes by when moving rectangle around with arrow keys
    short tempPixel = 0; //Pixels are shifted in here; the low byte is written to VRAM every 8 pixels
    register double zr, zi; //zr is real, zi imaginary
    register double zr2, zi2; //zr2 = zr^2, zi2 = zi^2
    register double x1 = -2.0; //bounding box cords on graph
    register double x2 = 2.0; //bounding box cords on graph
    register double y1 = -1.0; //bounding box cords on graph
    register double y2 = 1.0; //bounding box cords on graph
    register double x, y; //pixel cords on graph tested if in set
    register double xIsz, yIsz; //amount x/y increases by when ploting graph
    register unsigned short iMax = 32; //max iterations
    register unsigned short i; //iterations

    while (TRUE) {
        register char* vram = ML_vram_adress();

        //---- render phase ----
        SetTimer(1, 200, stop); //stop() polls the keyboard while the render is running
        ML_clear_vram();
        ML_display_vram();
        xIsz = (x2 - x1) / 128;
        yIsz = (y2 - y1) / 64;
        y = y1;
        for (dispY = 0; dispY < 64; dispY++) {
            x = x1;
            y += yIsz;
            for (dispX = 0; dispX < 128; dispX++) {
                zr = x;
                zi = y;
                for (i = 0; i < iMax; i++) {
                    zr2 = zr * zr;
                    zi2 = zi * zi;
                    if (zr2 + zi2 > 4)
                        break;
                    zi = zr * zi;
                    zi += zi + y; //zi = 2*zr*zi + y
                    zr = zr2 - zi2 + x;
                }
                tempPixel = (tempPixel << 1) | (i == iMax); //1 = never escaped = black
                if ((dispX & 7) == 7)
                    *vram++ = tempPixel;
                x += xIsz;
            }
            //stop() aborts by setting dispY to 64; that is not a valid row, so only
            //push the row to the LCD while the counter is still on screen.
            if (dispY < 64)
                ML_display_vram_row(dispY);
        }
        SaveDisp(1);
        KillTimer(1);

        //---- selection phase ----
        screenX = 0;
        screenY = 0;
        screenZoom = 1;
        Bkey_GetKeyWait(&kcode1, &kcode2, 2, 1, 1, &unused);
        do {
            GetKey(&key);
            //Step sizes are based on the zoom level before this key press is applied.
            screenMove = screenZoom > 4 ? 1 : divByPow(16, 2, screenZoom);
            graphMove = screenZoom > 4 ? divByPow(1, 2, graphZoom - (double)screenZoom) : divByPow(16, 2, graphZoom);
            switch (key) {
            case KEY_CHAR_PLUS:
                if (graphZoom < 51) {
                    graphZoom++;
                    screenZoom++;
                }
                break;
            case KEY_CHAR_MINUS:
                if (graphZoom) {
                    graphZoom--;
                    screenZoom--;
                }
                break;
            case KEY_CTRL_UP:
                screenY -= screenMove;
                graphY -= graphMove;
                break;
            case KEY_CTRL_DOWN:
                screenY += screenMove;
                graphY += graphMove;
                break;
            case KEY_CTRL_LEFT:
                screenX -= screenMove;
                graphX -= graphMove;
                break;
            case KEY_CTRL_RIGHT:
                screenX += screenMove;
                graphX += graphMove;
                break;
            case KEY_CTRL_F1:
                HUD = !HUD;
                break;
            case KEY_CTRL_F2:
                //cycle ML_XOR -> ML_BLACK -> ML_WHITE -> ML_XOR
                if (colour)
                    colour--;
                else
                    colour = ML_XOR;
                break;
            case KEY_CTRL_F3:
                //Gray scale, by refreshing screen multiple times per sec at different max iterations (iMax)
                break;
            case KEY_CTRL_AC:
                graphZoom = 1;
                graphX = 0;
                graphY = 0;
                screenZoom = 1;
                screenX = 0;
                screenY = 0;
                key = KEY_CTRL_EXE; //leave the selection loop and redraw the default view
                break;
            }
            RestoreDisp(1);
            iMax = 8 * (graphZoom + 3);

            //Selection marker: a rectangle while it is still visible, a single pixel beyond that.
            if (screenZoom < 8) {
                screenX1 = 65 - divByPow(128, 2, screenZoom) + screenX;
                screenX2 = 62 + divByPow(128, 2, screenZoom) + screenX;
                screenY1 = 32 - (screenZoom > 6 ? 1 : divByPow(64, 2, screenZoom)) + screenY;
                screenY2 = 31 + (screenZoom > 6 ? 0 : divByPow(64, 2, screenZoom)) + screenY;
                ML_horizontal_line(screenY1, screenX1, screenX2, colour);
                ML_horizontal_line(screenY2, screenX1, screenX2, colour);
                ML_vertical_line(screenX1 - 1, screenY1, screenY2, colour);
                ML_vertical_line(screenX2 + 1, screenY1, screenY2, colour);
            } else
                ML_pixel(screenX + 64, screenY + 31, colour);

            //Bounding box for the next render: half-width 4 / 2^graphZoom around the chosen centre.
            x1 = divByPow(-4, 2, graphZoom) + (0.03125 * graphX);
            x2 = divByPow(4, 2, graphZoom) + (0.03125 * graphX);
            y1 = divByPow(-2, 2, graphZoom) + (0.03125 * graphY);
            y2 = divByPow(2, 2, graphZoom) + (0.03125 * graphY);

            if (HUD == TRUE) {
                sprintf((char *)string, "X1:%f", x1);
                PrintMini(0, 0, string, 0);
                sprintf((char *)string, "Y1:%f", y1);
                PrintMini(0, 6, string, 0);
                sprintf((char *)string, "X2:%f", x2);
                PrintMini(81, 53, string, 0);
                sprintf((char *)string, "Y2:%f", y2);
                PrintMini(81, 59, string, 0);
                sprintf((char *)string, "MaxI:%u", iMax);
                PrintMini(0, 53, string, 0);
                if (graphZoom > 32)
                    sprintf((char *)string, "Zoom:2^%ux", graphZoom - 1);
                else
                    //2^(graphZoom-1) reaches 2^31 here, which does not fit a signed int:
                    //convert to unsigned to match %u and avoid the out-of-range conversion.
                    sprintf((char *)string, "Zoom:%ux", (unsigned int)divByPow(1, 2, 1 - (int)graphZoom));
                PrintMini(0, 59, string, 0);
            }
            ML_display_vram();
        } while (key != KEY_CTRL_EXE);
    }
    return 0;
}
// NOTE(review): the two "#pragma section" regions below look like the standard add-in
// start-up boilerplate of the SDK - confirm against the SDK project template before editing.
// BR_Size is declared inside the _BR_Size section; nothing in this file reads or writes it.
#pragma section _BR_Size
unsigned long BR_Size;
#pragma section
#pragma section _TOP
// Declared inside the _TOP section: forwards both arguments unchanged to
// INIT_ADDIN_APPLICATION and returns its result.
int InitializeSystem(int isAppli, unsigned short OptionNum) {
return INIT_ADDIN_APPLICATION(isAppli, OptionNum);
}
#pragma section
Fichier joint
Citer : Posté le 16/11/2019 03:46 | # |
Fichier joint
oops
I removed it in code, but forgot to recompile
Once I have got the 64bit Fixed Point assembly code all working
I'll start adding more features
- Gray Scale
- Higher zoom level
- Faster rendering
- Customizable Iterations
- Customizable HUD
- Julia Set
MANDEL.G1A
Mandelbrot SNKEmini Minesweeper Sudoku
Statut : Invité
Citer : Posté le 16/11/2019 04:08 | #
Wow! I eagerly wait!
Citer : Posté le 16/11/2019 08:54 | #
Wow. I tried that with full overclock, that's some stunning results right there! Deep full-screen images take about 10s and the overall fractal is drawn in about 1s!
I know it would be slower on fx-CG 50 due to the larger screen (9× more pixels to draw), but the colors, ah... x)
Citer : Posté le 16/11/2019 10:10 | #
I'm not sure how to format signed numbers
0x07000000 07000000 converted to negative becomes 0xF8FFFFFF F9000000
but the negative output of the multiplier (or subtractors) is 0xF9000000 F9000000, this is because after inverting the number, it adds +1 to both limbs of the whole number, rather than just the lowest limb
I'm having huge problems because the inputs need to be 'normal' negative numbers, but everything outputs the double +1 numbers
But if everything is kept positive (multiplier and adder, no subtractors) it's all fine
I'm confused that other people online don't seem to have this problem; I wonder if it just doesn't matter, or if they are using other methods that avoid it
Mandelbrot SNKEmini Minesweeper Sudoku
Citer : Posté le 16/11/2019 11:00 | #
It might be useful to remember the negation identity: -x = ~x + 1.
This is true whatever the size of the integer is. Here you are dealing with 64-bits, but it works just as well as with 32-bit integers. The main change is that you need to use carry-aware instructions to propagate changes from one half to the other.
Just as with addc, there is a negc instruction which does what you need. In fact, the example usage of addc in the manual is exactly this operation.
Before: r0,r1 = 00000000,00000001
After: r0,r1 = ffffffff,ffffffff
clrt
negc r1, r1
negc r0, r0
Citer : Posté le 21/11/2019 08:33 | #
The reason why no one else had problems with negative numbers is because
1. They didn't bother, and their design wouldn't work with negatives
or 2. They detected whether the number is negative, made it positive, then re-applied the sign at the end
So that's what I did
some half Optimized code
; Signed 64x64-bit fixed-point multiply using sign-magnitude: both operands are made
; positive, multiplied as unsigned values, and the sign is re-applied at the end.
; NOTE(review): the entry label of this listing is not visible here.
; Inputs as used by the code: x = r4:r5 (high:low), y = r6:r7 (high:low);
; two result pointers are read from the stack (see _positive below).
; r4, r5, r6, r7, r8, r9, 10, 11
; 12 * 34 = 10*30 + 2*30 + 10*4 + 2*4 = 300 + 60 + 40 + 8 = 408
;x1x0 * y1y0 = x1*y1 + x0*y1 + x1*y0 + x0*y0 =
;Decimal point is 8bits right, from the left 1:7:56 Sign:Int:Frac
; save the registers this routine overwrites (16 bytes pushed)
mov.l r8, @-r15
mov.l r9, @-r15
mov.l r10, @-r15
mov.l r11, @-r15
; r1 = 1 (toggle mask), r10 = 0 (constant zero), r11 = 0 (result sign flag)
mov.l #1, r1
mov.l #0, r10
mov.l #0, r11
; if x < 0: x = -x and flip the sign flag.
; T is 0 on the fall-through path (the compare failed), so the negc pair needs no clrt.
cmp/ge r10, r4
bt _positiveX
negc r5, r5
negc r4, r4
xor r1, r11
_positiveX:
; same for y
cmp/ge r10, r6
bt _positiveY
negc r7, r7
negc r6, r6
xor r1, r11
_positiveY:
; Accumulate bits 127..32 of the 128-bit product in r0:r1:r2 (high:mid:low).
; The low word of x0*y0 (bits 31..0) is never read.
mov.l #0, r1
dmulu.l r5, r7 ;x0 * y0
sts mach, r2
clrt
dmulu.l r5, r6 ;x0 * y1
sts macl, r8
addc r8, r2
sts mach, r8
addc r8, r1
; r0 starts as the carry out of the middle word
movt r0
clrt
dmulu.l r4, r7 ;x1 * y0
sts macl, r8
addc r8, r2
sts mach, r8
addc r8, r1
; propagate the carry into the top word (r10 is 0)
addc r10, r0
clrt
dmulu.l r4, r6 ;x1 * y1
sts macl, r8
addc r8, r1
sts mach, r8
addc r8, r0
; Shift r0:r1:r2 left by 8 bits and keep the upper 64 bits in r0:r1.
; shld with a negative count shifts right (logical).
; XXXXXXXX YYYYYYYY ZZZZZZZZ WWWWWWWW
;XX XXXXXXYY YYYYYYZZ ZZZZZZWW WWWWWW
mov #8, r8 ;8 //Left
shld r8, r0 ; XXXXXX
mov r1, r5
shld r8, r1 ; YYYYYY
mov #-24, r8 ;8-32 //Right
shld r8, r5 ;YY
shld r8, r2 ;ZZ
add r5, r0
add r2, r1
; negate the result when exactly one operand was negative (r11 != 0);
; T is 0 on the fall-through path, as above
cmp/eq r10, r11
bt _positive
negc r1, r1
negc r0, r0
_positive:
; After the four pushes the caller's stack arguments start at @(16,r15).
; r4 and r5 receive the two destination pointers; r6 and r7 are loaded but not used afterwards.
mov.l @(16,r15), r4
mov.l @(20,r15), r5
mov.l @(24,r15), r6
mov.l @(28,r15), r7
mov.l r0, @r4 ;High
mov.l r1, @r5 ;Mid
; restore the saved registers in reverse push order; the final restore (r8) is the
; instruction that follows rts and therefore executes in its delay slot
mov.l @r15+, r11
mov.l @r15+, r10
mov.l @r15+, r9
rts
mov.l @r15+, r8
_mul64Optimized: ;mul64Optimized(x1, x0, y1, y0, &high, &mid, &low, &below)
; Body of the signed 64x64-bit fixed-point multiply (sign-magnitude approach):
; both operands are made positive, multiplied as unsigned values, and the sign is
; re-applied at the end.
; Inputs as used by the code: x = r4:r5 (high:low), y = r6:r7 (high:low);
; two result pointers are read from the stack (see _positive below).
; r4, r5, r6, r7, r8, r9, 10, 11
; 12 * 34 = 10*30 + 2*30 + 10*4 + 2*4 = 300 + 60 + 40 + 8 = 408
;x1x0 * y1y0 = x1*y1 + x0*y1 + x1*y0 + x0*y0 =
;Decimal point is 8bits right, from the left 1:7:56 Sign:Int:Frac
; save the registers this routine overwrites (16 bytes pushed)
mov.l r8, @-r15
mov.l r9, @-r15
mov.l r10, @-r15
mov.l r11, @-r15
; r1 = 1 (toggle mask), r10 = 0 (constant zero), r11 = 0 (result sign flag)
mov.l #1, r1
mov.l #0, r10
mov.l #0, r11
; if x < 0: x = -x and flip the sign flag.
; T is 0 on the fall-through path (the compare failed), so the negc pair needs no clrt.
cmp/ge r10, r4
bt _positiveX
negc r5, r5
negc r4, r4
xor r1, r11
_positiveX:
; same for y
cmp/ge r10, r6
bt _positiveY
negc r7, r7
negc r6, r6
xor r1, r11
_positiveY:
; Accumulate bits 127..32 of the 128-bit product in r0:r1:r2 (high:mid:low).
; The low word of x0*y0 (bits 31..0) is never read.
mov.l #0, r1
dmulu.l r5, r7 ;x0 * y0
sts mach, r2
clrt
dmulu.l r5, r6 ;x0 * y1
sts macl, r8
addc r8, r2
sts mach, r8
addc r8, r1
; r0 starts as the carry out of the middle word
movt r0
clrt
dmulu.l r4, r7 ;x1 * y0
sts macl, r8
addc r8, r2
sts mach, r8
addc r8, r1
; propagate the carry into the top word (r10 is 0)
addc r10, r0
clrt
dmulu.l r4, r6 ;x1 * y1
sts macl, r8
addc r8, r1
sts mach, r8
addc r8, r0
; Shift r0:r1:r2 left by 8 bits and keep the upper 64 bits in r0:r1.
; shld with a negative count shifts right (logical).
; XXXXXXXX YYYYYYYY ZZZZZZZZ WWWWWWWW
;XX XXXXXXYY YYYYYYZZ ZZZZZZWW WWWWWW
mov #8, r8 ;8 //Left
shld r8, r0 ; XXXXXX
mov r1, r5
shld r8, r1 ; YYYYYY
mov #-24, r8 ;8-32 //Right
shld r8, r5 ;YY
shld r8, r2 ;ZZ
add r5, r0
add r2, r1
; negate the result when exactly one operand was negative (r11 != 0);
; T is 0 on the fall-through path, as above
cmp/eq r10, r11
bt _positive
negc r1, r1
negc r0, r0
_positive:
; After the four pushes the caller's stack arguments start at @(16,r15).
; r4 and r5 receive the two destination pointers; r6 and r7 are loaded but not used afterwards.
mov.l @(16,r15), r4
mov.l @(20,r15), r5
mov.l @(24,r15), r6
mov.l @(28,r15), r7
mov.l r0, @r4 ;High
mov.l r1, @r5 ;Mid
; restore the saved registers in reverse push order
mov.l @r15+, r11
mov.l @r15+, r10
mov.l @r15+, r9
mov.l @r15+, r8
rts
; delay slot of rts left empty
nop
some optimized and very unreadable code
; Hand-scheduled variant of the signed 64x64-bit fixed-point multiply: same
; sign-magnitude algorithm, but register saves are interleaved with the arithmetic
; and branch delay slots (bt/s, rts) carry useful instructions.
; NOTE(review): the entry label of this listing is not visible here.
; Inputs as used by the code: x = r4:r5 (high:low), y = r6:r7 (high:low);
; two result pointers are read from the stack near the end.
; r4, r5, r6, r7, r8, r9, 10, 11
; 12 * 34 = 10*30 + 2*30 + 10*4 + 2*4 = 300 + 60 + 40 + 8 = 408
;x1x0 * y1y0 = x1*y1 + x0*y1 + x1*y0 + x0*y0 =
;Decimal point is 8bits right, from the left 1:7:56 Sign:Int:Frac
; r11 is saved first because it becomes the constant zero; r0 is the sign flag for now
mov.l r11, @-r15
mov.l #0, r0
mov.l #0, r11
; if x < 0: x = -x and flip the sign flag (T is 0 on the fall-through path)
cmp/ge r11, r4
bt _positiveX
;wasted cycle
negc r5, r5
negc r4, r4
xor #1, r0
_positiveX:
; same for y; the push of r10 sits in the delay slot of bt/s, so it executes
; whether or not the branch is taken
cmp/ge r11, r6
bt/s _positiveY
mov.l r10, @-r15
negc r7, r7
negc r6, r6
xor #1, r0
_positiveY:
; Accumulate bits 127..32 of the 128-bit product in r0:r1:r2 (high:mid:low).
; The remaining pushes (r9, r8) are placed after the first multiply is issued.
dmulu.l r5, r7 ;x0 * y0
mov.l r9, @-r15
mov.l r8, @-r15
mov.l #0, r1
sts mach, r2
dmulu.l r5, r6 ;x0 * y1
; move the sign flag to r10 before r0 is reused as the top word of the accumulator
mov r0, r10
clrt
;wasted cycle
sts macl, r8
addc r8, r2
sts mach, r8
; the next multiply is issued as soon as mach/macl have been read
dmulu.l r4, r7 ;x1 * y0
addc r8, r1
movt r0
clrt
sts macl, r8
addc r8, r2
sts mach, r8
dmulu.l r4, r6 ;x1 * y1
addc r8, r1
; propagate the carry into the top word (r11 is 0)
addc r11, r0
clrt
sts macl, r8
addc r8, r1
sts mach, r8
addc r8, r0
; Shift r0:r1:r2 left by 8 bits and keep the upper 64 bits in r0:r1.
; shld with a negative count shifts right (logical).
; XXXXXXXX YYYYYYYY ZZZZZZZZ WWWWWWWW
;XX XXXXXXYY YYYYYYZZ ZZZZZZWW WWWWWW
mov #8, r8 ;8 //Left
shld r8, r0 ; XXXXXX
mov r1, r5
shld r8, r1 ; YYYYYY
mov #-24, r8 ;8-32 //Right
shld r8, r5 ;YY
shld r8, r2 ;ZZ
add r5, r0
add r2, r1
; negate the result when the sign flag in r10 is set; the load of the first
; destination pointer sits in the bt/s delay slot and always executes.
; Four registers are on the stack here, so caller stack arguments start at @(16,r15).
cmp/eq r11, r10
bt/s _positive
mov.l @(16,r15), r4
negc r1, r1
negc r0, r0
_positive:
mov.l @(20,r15), r5
mov.l r0, @r4 ;High
mov.l r1, @r5 ;Mid
; restore in reverse push order; the last restore (r11) runs in the rts delay slot
mov.l @r15+, r8
mov.l @r15+, r9
mov.l @r15+, r10
rts
mov.l @r15+, r11
Mandelbrot SNKEmini Minesweeper Sudoku
Citer : Posté le 21/11/2019 09:35 | #
Looks nice! I wonder if this could be shorter. Have you looked at the libgcc implementation? Here is how they multiply unsigned 64-bit values (where you don't have carry problems):
.h and .l represent the high and low halves of a 64-bit value
X is 2^32 (also looks like a polynomial)
(Xr4+r5)(Xr6+r7) = r5r7 + X(r4r7 + r5r6) + X^2(r4r6)
= r5r7 + X(r4r7 + r5r6) (X^2 overflows)
= r5r7 + X(r4r7.l + r5r6.l) (X * rxry.h overflows)
Output is:
r0 = r4r7.l + r5r6.l + r5r7.h (higher half)
r1 = r5r7.l (lower half)
dmulu.l r5,r7
sts macl,r1 # r1 = r5r7.l
sts mach,r2 # r2 = r5r7.h
mul.l r6,r5
sts macl,r0 # r0 = r5r6.l
mul.l r7,r4
add r2,r0 # r0 = r5r6.l + r5r7.h
sts macl,r2 # r2 = r4r7.l
rts
add r2,r0 # r0 = r4r7.l + r5r6.l + r5r7.h
I'm mainly mentioning this because of the "analysis" of the multiplication, which allows using 32-bit multiplications at times and has no carry. If you make both operands unsigned at the beginning then add the sign at the very end, maybe you can gain on these clrt and adds everywhere?
Anyway, this already looks really good! The performance of the fixed-point version is clearly very fast!
Citer : Posté le 21/11/2019 09:48 | #
That 64x64 multiply only gives the lower half of the 128-bit output
But I need the high half because the fixed point is very high up
That's why it uses plain mul.l and not dmulu.l
I had a feeling that I didn't need any of those carries, I had them there when testing signed multiplication
Mandelbrot SNKEmini Minesweeper Sudoku