Seuls les membres ayant 30 points peuvent parler sur le chat.

Forum Casio - Projets de programmation


Index du Forum » Projets de programmation » Mandelbrot Generator
RedcmdHors ligneMembrePoints: 201 Défis: 5 Message

Mandelbrot Generator

Posté le 15/06/2019 10:38

Hello allô

Default 1x zoom takes 7sec
Max zoom takes around 5-10min
It has a max zoom of 2^50: over one Quadrillion!
Going over 2^48 can be rather buggy
This is because numbers are limited to the 8 byte double variables

Attached file is both SH4 and SH3 compatible: MANDEL.G1A

This needs the 'MonochromeLib' library; its code is now included with the program.

Controls
[-] Zoom out
[+] Zoom in
[F1] Hide/show HUD which contains Cords, Zoom level and Max Iterations. (Heads Up Display)
[F2] Changes colours of camera rectangle: Black, White & Inverted
[AC] Resets screen back to default state
[EXE] Draw set
[EXIT] Stop drawing the Mandelbrot (If it's taking too long)
[MENU] Return to the menu screen
[REPLAY] Move camera rectangle around (Arrow Keys: [LEFT], [RIGHT], [UP], [DOWN])

How can I optimize this code to run faster or zoom in further?

#include "fxlib.h"
#include "stdio.h"

#define TRUE 1
#define FALSE 0

//Calls the machine-code words stored in sc0135 as a function returning char* (used as the VRAM base address).
//NOTE(review): presumably a trampoline for syscall 0x0135 - confirm against the SDK.
#define ML_vram_adress (*(sc_cpv)sc0135)

//Drawing modes accepted by the ML_* primitives
typedef enum { ML_TRANSPARENT = -1, ML_WHITE, ML_BLACK, ML_XOR, ML_CHECKER } ML_Color;
typedef char* (*sc_cpv)(void);
const unsigned int sc0135[] = { 0xD201D002, 0x422B0009, 0x80010070, 0x0135 };

unsigned int key;                //last key returned by GetKey()
int kcode1, kcode2;                //row & col keycode for Bkey_GetKeyWait()
char unused;                    //dummy output argument required by Bkey_GetKeyWait()
//Render-loop counters. stop() overwrites them from a timer callback while
//AddIn_main() is looping on them, so they must be volatile or the compiler
//may keep them in registers and never see the abort.
volatile unsigned short dispX, dispY;    //cords on display when drawing mandelbrot

//Clears the whole 1024-byte VRAM (64 rows of 16 bytes).
//Works in three steps: a byte-wise head of 1..4 bytes up to a 4-byte boundary,
//255 aligned 32-bit stores (1020 bytes), then a byte-wise tail of (vram & 3) bytes.
//head + 1020 + tail = (4 - r) + 1020 + r = 1024 for every alignment r.
void ML_clear_vram() {
    int i, end, * pointer_long, vram;
    char* pointer_byte;
    vram = (int)ML_vram_adress();
    //Parentheses are required: "4 - vram & 3" parses as "(4 - vram) & 3", which
    //yields 0 for an aligned buffer and left its last 4 bytes uncleared.
    end = 4 - (vram & 3);
    pointer_byte = (char*)vram;
    for (i = 0; i < end; i++) pointer_byte[i] = 0;
    pointer_long = (int*)(vram + end);        //4-byte aligned from here on
    for (i = 0; i < 255; i++) pointer_long[i] = 0;
    pointer_byte += 1020 + end;
    end = vram & 3;
    for (i = 0; i < end; i++) pointer_byte[i] = 0;
}

//Copies the whole VRAM (64 rows x 16 bytes) to the LCD controller.
//For each row: register 4 is selected and written with (row | 192), register 4 is
//selected again and written with 0, then register 7 is selected and the 16 row
//bytes are streamed into the data register.
//NOTE(review): register 4 presumably sets the row/column address and register 7
//is presumably the data-write register - confirm against the LCD driver docs.
void ML_display_vram() {
    //Memory-mapped I/O: volatile keeps the compiler from merging or dropping
    //the repeated stores to the same two addresses.
    volatile char* LCD_register_selector = (volatile char*)0xB4000000;
    volatile char* LCD_data_register = (volatile char*)0xB4010000;
    char* vram;
    int i, j;
    vram = ML_vram_adress();
    for (i = 0; i < 64; i++) {
        *LCD_register_selector = 4;
        *LCD_data_register = i | 192;
        *LCD_register_selector = 4;
        *LCD_data_register = 0;
        *LCD_register_selector = 7;
        for (j = 0; j < 16; j++)
            *LCD_data_register = *vram++;
    }
}

//Copies a single 16-byte VRAM row to the LCD; cheaper than ML_display_vram(),
//which transfers all 64 rows.
//row: row index; the caller must pass 0..63, no range check is done here.
void ML_display_vram_row(int row) {
    unsigned char i;
    //Memory-mapped I/O: volatile keeps the compiler from merging or dropping
    //the repeated stores to the same two addresses.
    volatile char* LCD_register_selector = (volatile char*)0xB4000000;
    volatile char* LCD_data_register = (volatile char*)0xB4010000;
    char* vram;
    vram = (row << 4) + ML_vram_adress();        //16 bytes per row
    *LCD_register_selector = 4;
    *LCD_data_register = row | 192;
    *LCD_register_selector = 4;
    *LCD_data_register = 0;
    *LCD_register_selector = 7;
    for (i = 0; i < 16; i++)
        *LCD_data_register = *vram++;
}

//Draws a horizontal line in VRAM on row y between columns x1 and x2 (inclusive, any order).
//y must be 0..63, otherwise nothing is drawn; x1/x2 are clipped to 0..127 and the
//call is a no-op when both lie off the same side of the screen.
//color: ML_BLACK sets pixels, ML_WHITE clears them, ML_XOR inverts them, ML_CHECKER
//writes an alternating pattern; any other value (ML_TRANSPARENT) draws nothing.
//VRAM layout used here: 16 bytes per row, bit 7 of each byte is the leftmost pixel.
void ML_horizontal_line(int y, int x1, int x2, ML_Color color) {
    int i;
    char checker;
    char* vram = ML_vram_adress();
    if (y & ~63 || (x1 < 0 && x2 < 0) || (x1 > 127 && x2 > 127))
        return;
    //order the end points so that x1 <= x2
    if (x1 > x2) {
        i = x1;
        x1 = x2;
        x2 = i;
    }
    if (x1 < 0)
        x1 = 0;
    if (x2 > 127)
        x2 = 127;
    //Every case handles two situations: the line spans several bytes (partial first
    //byte, partial last byte, whole bytes in between) or it fits inside one byte.
    //The shift counts rely on "+"/"-" binding tighter than "<<"/">>".
    switch (color) {
        case ML_BLACK:
            if (x1 >> 3 != x2 >> 3) {
                vram[(y << 4) + (x1 >> 3)] |= 255 >> (x1 & 7);
                vram[(y << 4) + (x2 >> 3)] |= 255 << 7 - (x2 & 7);
                for (i = (x1 >> 3) + 1; i < x2 >> 3; i++)
                    vram[(y << 4) + i] = 255;
            } else
                vram[(y << 4) + (x1 >> 3)] |= (255 >> (x1 % 8 + 7 - x2 % 8)) << (7 - (x2 & 7));
            break;
        case ML_WHITE:
            if (x1 >> 3 != x2 >> 3) {
                vram[(y << 4) + (x1 >> 3)] &= 255 << 8 - (x1 & 7);
                vram[(y << 4) + (x2 >> 3)] &= 255 >> 1 + (x2 & 7);
                for (i = (x1 >> 3) + 1; i < x2 >> 3; i++)
                    vram[(y << 4) + i] = 0;
            } else
                vram[(y << 4) + (x1 >> 3)] &= (255 << 8 - (x1 & 7)) | (255 >> 1 + (x2 & 7));
            break;
        case ML_XOR:
            if (x1 >> 3 != x2 >> 3) {
                vram[(y << 4) + (x1 >> 3)] ^= 255 >> (x1 & 7);
                vram[(y << 4) + (x2 >> 3)] ^= 255 << 7 - (x2 & 7);
                for (i = (x1 >> 3) + 1; i < (x2 >> 3); i++)
                    vram[(y << 4) + i] ^= 255;
            } else
                vram[(y << 4) + (x1 >> 3)] ^= (255 >> ((x1 & 7) + 7 - (x2 & 7))) << (7 - (x2 & 7));
            break;
        case ML_CHECKER:
            //85 = 01010101b on odd rows, 170 = 10101010b on even rows
            checker = (y & 1 ? 85 : 170);
            if (x1 >> 3 != x2 >> 3) {
                //clear the covered bits first, then OR in the pattern
                vram[(y << 4) + (x1 >> 3)] &= 255 << 8 - (x1 & 7);
                vram[(y << 4) + (x2 >> 3)] &= 255 >> 1 + (x2 & 7);
                vram[(y << 4) + (x1 >> 3)] |= checker & 255 >> (x1 & 7);
                vram[(y << 4) + (x2 >> 3)] |= checker & 255 << 7 - (x2 & 7);
                for (i = (x1 >> 3) + 1; i < x2 >> 3; i++)
                    vram[(y << 4) + i] = checker;
            } else {
                vram[(y << 4) + (x1 >> 3)] &= (255 << 8 - (x1 & 7)) | (255 >> 1 + (x2 & 7));
                vram[(y << 4) + (x1 >> 3)] |= checker & (255 >> (x1 % 8 + 7 - x2 % 8)) << (7 - (x2 & 7));
            }
            break;
    }
}

//Draws a vertical line in VRAM on column x between rows y1 and y2 (inclusive, any order).
//x must be 0..127, otherwise nothing is drawn; y1/y2 are clipped to 0..63 and the
//call is a no-op when both lie off the same side of the screen.
//color: ML_BLACK sets, ML_WHITE clears, ML_XOR inverts, ML_CHECKER alternates pixels
//down the column; any other value draws nothing.
void ML_vertical_line(int x, int y1, int y2, ML_Color color) {
    int pos, last;
    char parity, mask, * vram = ML_vram_adress();

    if (x & ~127)
        return;
    if ((y1 < 0 && y2 < 0) || (y1 > 63 && y2 > 63))
        return;

    //order the end points, then clip them to the screen
    if (y1 > y2) {
        pos = y1;
        y1 = y2;
        y2 = pos;
    }
    y1 = y1 < 0 ? 0 : y1;
    y2 = y2 > 63 ? 63 : y2;

    //byte offsets of the first and last affected VRAM bytes (16 bytes per row)
    pos = (y1 << 4) + (x >> 3);
    last = (y2 << 4) + (x >> 3);
    mask = 128 >> (x & 7);            //bit 7 is the leftmost pixel of a byte

    switch (color) {
        case ML_BLACK:
            while (pos <= last) {
                vram[pos] |= mask;
                pos += 16;
            }
            break;
        case ML_WHITE:
            mask = ~(128 >> (x & 7));
            while (pos <= last) {
                vram[pos] &= mask;
                pos += 16;
            }
            break;
        case ML_XOR:
            while (pos <= last) {
                vram[pos] ^= mask;
                pos += 16;
            }
            break;
        case ML_CHECKER:
            //a set parity flag means "clear this pixel"; it flips on every row
            parity = (y1 & 1) ^ (x & 1);
            while (pos <= last) {
                if (parity)
                    vram[pos] &= ~mask;
                else
                    vram[pos] |= mask;
                parity = !parity;
                pos += 16;
            }
            break;
    }
}

//Plots one pixel in VRAM. Coordinates outside 0..127 x 0..63 are ignored.
//color: ML_BLACK sets, ML_WHITE clears, ML_XOR inverts; ML_CHECKER clears the pixel
//when x and y have different parity and sets it otherwise; other values do nothing.
void ML_pixel(int x, int y, ML_Color color) {
    char* vram = ML_vram_adress();
    char* cell;
    int bit;

    if ((x & ~127) || (y & ~63))
        return;

    cell = vram + (y << 4) + (x >> 3);    //16 bytes per row, 8 pixels per byte
    bit = 128 >> (x & 7);                //bit 7 is the leftmost pixel

    //resolve the checker pattern to a plain set/clear for this position
    if (color == ML_CHECKER)
        color = ((y & 1) ^ (x & 1)) ? ML_WHITE : ML_BLACK;

    switch (color) {
        case ML_BLACK:
            *cell |= bit;
            break;
        case ML_WHITE:
            *cell &= ~bit;
            break;
        case ML_XOR:
            *cell ^= bit;
            break;
        default:
            break;
    }
}

//Returns n / x^p by repeated division (p > 0) or repeated multiplication (p < 0);
//p == 0 returns n unchanged. Works on doubles so results beyond the 32-bit int range are possible.
double divByPow(double n, double x, int p) {
    while (p < 0) {        //negative exponent: multiply |p| times
        n *= x;
        p++;
    }
    while (p > 0) {        //positive exponent: divide p times
        n /= x;
        p--;
    }
    return n;
}

//Timer callback: aborts the render when [EXIT] or [MENU] is pressed.
//It polls the keyboard and, for key codes (4,8) or (4,9), pushes both render-loop
//counters to their end values so the loops in AddIn_main() terminate.
void stop(void) {
    if (!Bkey_GetKeyWait(&kcode1, &kcode2, 1, 0, 1, &unused))
        return;                        //no key reported
    if (kcode1 != 4)
        return;
    if (kcode2 != 8 && kcode2 != 9)
        return;
    dispX = 128;    //Very hacky stop function
    dispY = 64;
}

int AddIn_main(int isAppli, unsigned short OptionNum) {        //Main function
    unsigned int graphZoom = 1;                //zoom level for graph
    char screenZoom;                        //zoom level on screen (rectangle)
    int screenX1, screenX2;                    //corner X cords for drawing rectangle to screen
    int screenY1, screenY2;                    //corner Y cords for drawing rectangle to screen
    unsigned char string[1];                //Used in converting int/double to char
    char HUD = TRUE;                        //Heads Up Display: Cords, Zoom level & Max iteration: toggle with [F1]
    char colour = ML_XOR;                    //Colour of rectangle: Black, White or Inverted
    int screenX, screenY;                    //offset cords on screen from 0,0 for rectangle
    double graphX = 0, graphY = 0;            //cords on graph - where to center mandelbrot
    double graphMove;                        //amount graphX & Y changes by when moving rectangle around
    int screenMove;                            //amount screenX & Y changes by when moving rectangle around with arrow keys
    short tempPixel = 0;                    //Write pixels to temp variable then write the entire 2bytes to VRAM all at once

    register double zr, zi;                    //zr is real, zi imaginary
    register double zr2, zi2;                //zr2 = zr^2, zi2 = zi^2
    register double x1 = -2.0;                //bounding box cords on graph
    register double x2 = 2.0;                //bounding box cords on graph
    register double y1 = -1.0;                //bounding box cords on graph
    register double y2 = 1.0;                //bounding box cords on graph
    register double x, y;                    //pixel cords on graph tested if in set
    register double xIsz, yIsz;                //amount x/y increases by when ploting graph
    register unsigned short iMax = 32;        //max iterations
    register unsigned short i;                //iterations

    while (TRUE) {
        register char* vram = ML_vram_adress();

        SetTimer(1, 200, stop);
        ML_clear_vram();
        ML_display_vram();

        xIsz = (x2 - x1) / 128;
        yIsz = (y2 - y1) / 64;

        y = y1;
        for (dispY = 0; dispY < 64; dispY++) {
            x = x1;
            y += yIsz;
            for (dispX = 0; dispX < 128; dispX++) {
                zr = x;
                zi = y;
                for (i = 0; i < iMax; i++) {
                    zr2 = zr * zr;
                    zi2 = zi * zi;
                    if (zr2 + zi2 > 4)
                        break;
                    zi = zr * zi;
                    zi += zi + y;
                    zr = zr2 - zi2 + x;
                }
                tempPixel = (tempPixel << 1) | (i == iMax);
                if ((dispX & 7) == 7)
                    *vram++ = tempPixel;
                x += xIsz;
            }
            ML_display_vram_row(dispY);
        }
        SaveDisp(1);
        KillTimer(1);
        screenX = 0;
        screenY = 0;
        screenZoom = 1;
        Bkey_GetKeyWait(&kcode1, &kcode2, 2, 1, 1, &unused);
        do {
            GetKey(&key);
            screenMove = screenZoom > 4 ? 1 : divByPow(16, 2, screenZoom);
            graphMove = screenZoom > 4 ? divByPow(1, 2, graphZoom - (double)screenZoom) : divByPow(16, 2, graphZoom);
            switch (key) {
                case KEY_CHAR_PLUS:
                    if (graphZoom < 51) {
                        graphZoom++;
                        screenZoom++;
                    }
                    break;
                case KEY_CHAR_MINUS:
                    if (graphZoom) {
                        graphZoom--;
                        screenZoom--;
                    }
                    break;
                case KEY_CTRL_UP:
                    screenY -= screenMove;
                    graphY -= graphMove;
                    break;
                case KEY_CTRL_DOWN:
                    screenY += screenMove;
                    graphY += graphMove;
                    break;
                case KEY_CTRL_LEFT:
                    screenX -= screenMove;
                    graphX -= graphMove;
                    break;
                case KEY_CTRL_RIGHT:
                    screenX += screenMove;
                    graphX += graphMove;
                    break;
                case KEY_CTRL_F1:
                    HUD = !HUD;
                    break;
                case KEY_CTRL_F2:
                    if (colour)
                        colour--;
                    else
                        colour = ML_XOR;
                    break;
                case KEY_CTRL_F3:
                    //Gray scale, by refreshing screen multiple times per sec at different max iterations (iMax)
                    break;
                case KEY_CTRL_AC:
                    graphZoom = 1;
                    graphX = 0;
                    graphY = 0;
                    screenZoom = 1;
                    screenX = 0;
                    screenY = 0;
                    key = KEY_CTRL_EXE;
                    break;
            }
            RestoreDisp(1);
            iMax = 8 * (graphZoom + 3);

            if (screenZoom < 8) {
                screenX1 = 65 - divByPow(128, 2, screenZoom) + screenX;
                screenX2 = 62 + divByPow(128, 2, screenZoom) + screenX;
                screenY1 = 32 - (screenZoom > 6 ? 1 : divByPow(64, 2, screenZoom)) + screenY;
                screenY2 = 31 + (screenZoom > 6 ? 0 : divByPow(64, 2, screenZoom)) + screenY;
                ML_horizontal_line(screenY1, screenX1, screenX2, colour);
                ML_horizontal_line(screenY2, screenX1, screenX2, colour);
                ML_vertical_line(screenX1 - 1, screenY1, screenY2, colour);
                ML_vertical_line(screenX2 + 1, screenY1, screenY2, colour);
            } else
                ML_pixel(screenX + 64, screenY + 31, colour);

            x1 = divByPow(-4, 2, graphZoom) + (0.03125 * graphX);
            x2 = divByPow(4, 2, graphZoom) + (0.03125 * graphX);
            y1 = divByPow(-2, 2, graphZoom) + (0.03125 * graphY);
            y2 = divByPow(2, 2, graphZoom) + (0.03125 * graphY);

            if (HUD == TRUE) {
                sprintf(&string, "X1:%f", x1);
                PrintMini(0, 0, string, 0);
                sprintf(&string, "Y1:%f", y1);
                PrintMini(0, 6, string, 0);
                sprintf(&string, "X2:%f", x2);
                PrintMini(81, 53, string, 0);
                sprintf(&string, "Y2:%f", y2);
                PrintMini(81, 59, string, 0);
                sprintf(&string, "MaxI:%u", iMax);
                PrintMini(0, 53, string, 0);
                if (graphZoom > 32)
                    sprintf(&string, "Zoom:2^%ux", graphZoom - 1);
                else
                    sprintf(&string, "Zoom:%ux", (int)divByPow(1, 2, -graphZoom + 1));
                PrintMini(0, 59, string, 0);
            }

            ML_display_vram();

        } while (key != KEY_CTRL_EXE);
    }
    return 0;
}


//Add-in boilerplate: BR_Size is placed in the _BR_Size section and
//InitializeSystem in the _TOP section; "#pragma section" with no name
//switches back to the default section.
#pragma section _BR_Size
unsigned long BR_Size;
#pragma section
#pragma section _TOP
//Forwards both arguments unchanged to INIT_ADDIN_APPLICATION and returns its result.
int InitializeSystem(int isAppli, unsigned short OptionNum) {
    return INIT_ADDIN_APPLICATION(isAppli, OptionNum);
}
#pragma section


Fichier joint


Pages : Précédente1, 2, 3, 4, 5
RedcmdHors ligneMembrePoints: 201 Défis: 5 Message

Citer : Posté le 16/11/2019 03:46 | # | Fichier joint


oops
I removed it in code, but forgot to recompile

Once I have got the 64bit Fixed Point assembly code all working
I'll start adding more features
- Gray Scale
- Higher zoom level
- Faster rendering
- Customizable Iterations
- Customizable HUD
- Julia Set

MANDEL.G1A

RedCMD#4299 - Discord
Mandelbrot SNKEmini Minesweeper Sudoku
Mrvoxy
Statut : Invité

Citer : Posté le 16/11/2019 04:08 | #


Wow! I eagerly wait!
LephenixnoirEn ligneAdministrateurPoints: 16436 Défis: 140 Message

Citer : Posté le 16/11/2019 08:54 | #


Wow. I tried that with full overclock, that's some stunning results right there! Deep full-screen images take about 10s and the overall fractal is drawn in about 1s!

I know it would be slower on fx-CG 50 due to the larger screen (9× more pixels to draw), but the colors, ah... x)
RedcmdHors ligneMembrePoints: 201 Défis: 5 Message

Citer : Posté le 16/11/2019 10:10 | #


I'm not sure how to format the signed numbers
0x07000000 07000000 converted to negative becomes 0xF8FFFFFF F9000000
but the negative output of the multiplier (or subtractors) is 0xF9000000 F9000000, this is because after inverting the number, it adds +1 to both limbs of the whole number, rather than just the lowest limb
I'm having huge problems with the inputs needing to be 'normal' negative numbers, but everything outputs the double +1 numbers
But if everything is kept positive (multiplier and adder, no subtractors) it's all fine
I'm confused that other people online don't seem to have this problem; I wonder if it just doesn't matter, or if they are using other methods that avoid it
RedCMD#4299 - Discord
Mandelbrot SNKEmini Minesweeper Sudoku
LephenixnoirEn ligneAdministrateurPoints: 16436 Défis: 140 Message

Citer : Posté le 16/11/2019 11:00 | #


It might be useful to remember the negation identity: -x = ~x + 1.

This is true whatever the size of the integer is. Here you are dealing with 64-bits, but it works just as well as with 32-bit integers. The main change is that you need to use carry-aware instructions to propagate changes from one half to the other.

Just as with addc, there is a negc instruction which does what you need. In fact, the example usage of addc in the manual is exactly this operation.

Negating r0,r1
Before: r0,r1 = 00000000,00000001
After:  r0,r1 = ffffffff,ffffffff

clrt                # T = 0, so the first negc has no borrow coming in
negc r1, r1         # low word:  r1 = 0 - r1 - T, borrow out goes to T
negc r0, r0         # high word: r0 = 0 - r0 - T, consuming the borrow from the low word
RedcmdHors ligneMembrePoints: 201 Défis: 5 Message

Citer : Posté le 21/11/2019 08:33 | #


The reason why no one else had problems with negative numbers is because
1. They didn't bother, and their design wouldn't work with negatives
or 2. They detected if the number is negative, made the number positive, then re-added the sign at the end
so that's what I did

some half Optimized code
_mul64:                                 ;mul64(x1, x0, y1, y0, &high, &mid, &low, &below)
                                        ;x1=r4, x0=r5, y1=r6, y0=r7; the pointer arguments are read from the stack
                                        ;(&high at @(16,r15) and &mid at @(20,r15) after the four pushes below)
;Signed 64x64 fixed-point multiply: stores bits of the 128-bit product, shifted left by 8,
;to *high (upper word) and *mid (lower word). &low and &below are loaded but never written.
;  12 * 34      =     10*30 +  2*30 + 10*4  +  2*4   =  300 + 60 + 40 + 8  =     408
;x1x0 * y1y0  =     x1*y1 + x0*y1 + x1*y0 + x0*y0  =    

;Decimal point is 8bits right, from the left        1:7:56        Sign:Int:Frac

    mov.l    r8,         @-r15
    mov.l    r9,         @-r15
    mov.l    r10,        @-r15
    mov.l    r11,        @-r15
    mov.l    #1,         r1
    mov.l    #0,         r10        ;r10 = constant 0
    mov.l    #0,            r11        ;r11 = sign flag of the result (1 = negative)

    cmp/ge    r10,        r4
    bt        _positiveX
    negc    r5,            r5        ;T is 0 here (bt fell through): 64-bit negate of x
    negc    r4,            r4
    xor        r1,            r11
_positiveX:

    cmp/ge    r10,        r6
    bt        _positiveY
    negc    r7,            r7        ;same for y
    negc    r6,            r6
    xor        r1,            r11
_positiveY:

    mov.l    #0,            r1

    dmulu.l r5,         r7        ;x0 * y0
    sts     mach,        r2        ;only the high word is kept

    clrt
    dmulu.l r5,         r6        ;x0 * y1
    sts     macl,        r8
    addc    r8,         r2
    sts     mach,        r8
    addc    r8,         r1
    movt    r0                    ;r0 = carry out
    
    clrt
    dmulu.l r4,         r7        ;x1 * y0
    sts     macl,        r8
    addc    r8,         r2
    sts     mach,        r8
    addc    r8,         r1
    addc    r10,        r0        ;r0 += carry

    clrt
    dmulu.l r4,         r6        ;x1 * y1
    sts     macl,        r8
    addc    r8,         r1
    sts     mach,        r8
    addc    r8,         r0
                                ;r0:r1:r2 now hold the upper 96 bits of the product
                                ;   XXXXXXXX YYYYYYYY ZZZZZZZZ WWWWWWWW
                                ;XX XXXXXXYY YYYYYYZZ ZZZZZZWW WWWWWW
    mov        #8,            r8        ;8            //Left
    shld    r8,         r0        ;    XXXXXX
    mov        r1,         r5
    shld    r8,         r1        ;    YYYYYY

    mov        #-24,        r8        ;8-32        //Right
    shld    r8,         r5        ;YY
    shld    r8,         r2        ;ZZ
    
    add        r5,            r0
    add        r2,            r1
    
    cmp/eq    r10,        r11
    bt        _positive
    negc    r1,            r1        ;T is 0 here: re-apply the sign to the 64-bit result
    negc    r0,            r0
_positive:

    mov.l    @(16,r15),    r4
    mov.l    @(20,r15),    r5
    mov.l    @(24,r15),    r6
    mov.l    @(28,r15),    r7
    mov.l    r0,            @r4     ;High
    mov.l    r1,            @r5     ;Mid
    mov.l    @r15+,        r11
    mov.l    @r15+,        r10
    mov.l    @r15+,        r9
    rts
    mov.l    @r15+,        r8        ;executed in the rts delay slot

;Second copy of the same routine as posted; it differs only in its epilogue (rts followed by nop).
;NOTE(review): its entry label and the local labels _positiveX/_positiveY/_positive repeat
;names used by the other listings, so the listings cannot be assembled together unchanged.
_mul64Optimized:                         ;mul64Optimized(x1, x0, y1, y0, &high, &mid, &low, &below)
                                        ;x1=r4, x0=r5, y1=r6, y0=r7; the pointer arguments are read from the stack
;  12 * 34      =     10*30 +  2*30 + 10*4  +  2*4   =  300 + 60 + 40 + 8  =     408
;x1x0 * y1y0  =     x1*y1 + x0*y1 + x1*y0 + x0*y0  =    

;Decimal point is 8bits right, from the left        1:7:56        Sign:Int:Frac

    mov.l    r8,         @-r15
    mov.l    r9,         @-r15
    mov.l    r10,        @-r15
    mov.l    r11,        @-r15
    mov.l    #1,         r1
    mov.l    #0,         r10
    mov.l    #0,            r11

    cmp/ge    r10,        r4
    bt        _positiveX
    negc    r5,            r5
    negc    r4,            r4
    xor        r1,            r11
_positiveX:

    cmp/ge    r10,        r6
    bt        _positiveY
    negc    r7,            r7
    negc    r6,            r6
    xor        r1,            r11
_positiveY:

    mov.l    #0,            r1

    dmulu.l r5,         r7        ;x0 * y0
    sts     mach,        r2

    clrt
    dmulu.l r5,         r6        ;x0 * y1
    sts     macl,        r8
    addc    r8,         r2
    sts     mach,        r8
    addc    r8,         r1
    movt    r0
    
    clrt
    dmulu.l r4,         r7        ;x1 * y0
    sts     macl,        r8
    addc    r8,         r2
    sts     mach,        r8
    addc    r8,         r1
    addc    r10,        r0

    clrt
    dmulu.l r4,         r6        ;x1 * y1
    sts     macl,        r8
    addc    r8,         r1
    sts     mach,        r8
    addc    r8,         r0
                                ;   XXXXXXXX YYYYYYYY ZZZZZZZZ WWWWWWWW
                                ;XX XXXXXXYY YYYYYYZZ ZZZZZZWW WWWWWW
    mov        #8,            r8        ;8            //Left
    shld    r8,         r0        ;    XXXXXX
    mov        r1,         r5
    shld    r8,         r1        ;    YYYYYY

    mov        #-24,        r8        ;8-32        //Right
    shld    r8,         r5        ;YY
    shld    r8,         r2        ;ZZ
    
    add        r5,            r0
    add        r2,            r1
    
    cmp/eq    r10,        r11
    bt        _positive
    negc    r1,            r1
    negc    r0,            r0
_positive:

    mov.l    @(16,r15),    r4
    mov.l    @(20,r15),    r5
    mov.l    @(24,r15),    r6
    mov.l    @(28,r15),    r7
    mov.l    r0,            @r4     ;High
    mov.l    r1,            @r5     ;Mid
    mov.l    @r15+,        r11
    mov.l    @r15+,        r10
    mov.l    @r15+,        r9
    mov.l    @r15+,        r8
    rts
    nop


some optimized and very unreadable code
_mul64Optimized:                         ;mul64Optimized(x1, x0, y1, y0, &high, &mid, &low, &below)
                                        ;x1=r4, x0=r5, y1=r6, y0=r7; &high and &mid are read from the stack
                                        ;(@(16,r15) and @(20,r15) once r11, r10, r9 and r8 have been pushed)
;Signed 64x64 fixed-point multiply with the register saves and multiplies interleaved
;between the other instructions. Result words go to *high and *mid; &low/&below are not used.
;  12 * 34      =     10*30 +  2*30 + 10*4  +  2*4   =  300 + 60 + 40 + 8  =     408
;x1x0 * y1y0  =     x1*y1 + x0*y1 + x1*y0 + x0*y0  =    

;Decimal point is 8bits right, from the left        1:7:56        Sign:Int:Frac

    mov.l    r11,        @-r15
    mov.l    #0,            r0        ;r0 = sign flag of the result (1 = negative)
    mov.l    #0,         r11        ;r11 = constant 0

    cmp/ge    r11,        r4
    bt        _positiveX
    ;wasted cycle
    negc    r5,            r5        ;T is 0 here (bt fell through): 64-bit negate of x
    negc    r4,            r4
    xor        #1,            r0
_positiveX:

    cmp/ge    r11,        r6
    bt/s    _positiveY
    mov.l    r10,        @-r15        ;delay slot: r10 is pushed on both paths
    negc    r7,            r7        ;64-bit negate of y
    negc    r6,            r6
    xor        #1,            r0
_positiveY:

    dmulu.l r5,         r7        ;x0 * y0
    mov.l    r9,         @-r15
    mov.l    r8,         @-r15
    mov.l    #0,            r1
    sts     mach,        r2        ;only the high word is kept

    dmulu.l r5,         r6        ;x0 * y1
    mov        r0,            r10        ;park the sign flag in r10, r0 is reused as accumulator
    clrt
    ;wasted cycle
    sts     macl,        r8
    addc    r8,         r2
    sts     mach,        r8
    
    dmulu.l r4,         r7        ;x1 * y0
    addc    r8,         r1        ;still adding the x0*y1 high word, carry from the addc above
    movt    r0                    ;r0 = carry out
    clrt
    sts     macl,        r8
    addc    r8,         r2
    sts     mach,        r8

    dmulu.l r4,         r6        ;x1 * y1
    addc    r8,         r1        ;still adding the x1*y0 high word
    addc    r11,        r0        ;r0 += carry
    clrt
    sts     macl,        r8
    addc    r8,         r1
    sts     mach,        r8
    addc    r8,         r0
                                ;r0:r1:r2 now hold the upper 96 bits of the product
                                ;   XXXXXXXX YYYYYYYY ZZZZZZZZ WWWWWWWW
                                ;XX XXXXXXYY YYYYYYZZ ZZZZZZWW WWWWWW
    mov        #8,            r8        ;8            //Left
    shld    r8,         r0        ;    XXXXXX
    mov        r1,         r5
    shld    r8,         r1        ;    YYYYYY

    mov        #-24,        r8        ;8-32        //Right
    shld    r8,         r5        ;YY
    shld    r8,         r2        ;ZZ
    
    add        r5,            r0
    add        r2,            r1
    
    cmp/eq    r11,        r10
    bt/s    _positive
    mov.l    @(16,r15),    r4        ;delay slot: r4 = &high on both paths
    negc    r1,            r1        ;T is 0 here: re-apply the sign to the 64-bit result
    negc    r0,            r0
_positive:

    mov.l    @(20,r15),    r5        ;r5 = &mid
    mov.l    r0,            @r4     ;High
    mov.l    r1,            @r5     ;Mid
    mov.l    @r15+,        r8
    mov.l    @r15+,        r9
    mov.l    @r15+,        r10
    rts
    mov.l    @r15+,        r11        ;executed in the rts delay slot

RedCMD#4299 - Discord
Mandelbrot SNKEmini Minesweeper Sudoku
LephenixnoirEn ligneAdministrateurPoints: 16436 Défis: 140 Message

Citer : Posté le 21/11/2019 09:35 | #


Looks nice! I wonder if this could be shorter. Have you looked at the libgcc implementation? Here is how they multiply unsigned 64-bit values (where you don't have carry problems):

Notation: All product rxry must be seen as 64-bit
          .h and .l represent the high and low halves of a 64-bit value
          X is 2^32 (also looks like a polynomial)

(Xr4+r5)(Xr6+r7) = r5r7 + X(r4r7 + r5r6) + X^2(r4r6)
                 = r5r7 + X(r4r7 + r5r6)                (X^2 overflows)
                 = r5r7 + X(r4r7.l + r5r6.l)            (X * rxry.h overflows)
Output is:
  r0 = r4r7.l + r5r6.l + r5r7.h    (higher half)
  r1 = r5r7.l                      (lower half)

# Unsigned 64x64 multiply keeping only the low 64 bits: (r4:r5) * (r6:r7) -> r0:r1
dmulu.l    r5,r7
sts    macl,r1        # r1 = r5r7.l
sts    mach,r2         # r2 = r5r7.h
mul.l    r6,r5
sts    macl,r0         # r0 = r5r6.l
mul.l    r7,r4
add    r2,r0           # r0 = r5r6.l + r5r7.h
sts    macl,r2         # r2 = r4r7.l
rts    
add    r2,r0           # r0 = r4r7.l + r5r6.l + r5r7.h (runs in the rts delay slot)

I'm mainly mentioning this because of the "analysis" of the multiplication, which allows using 32-bit multiplications at times and has no carry. If you make both operands unsigned at the beginning then add the sign at the very end, maybe you can gain on these clrt and adds everywhere?

Anyway, this already looks really good! The performance of the fixed-point version is clearly very fast!
RedcmdHors ligneMembrePoints: 201 Défis: 5 Message

Citer : Posté le 21/11/2019 09:48 | #


That 64x64 only gives the lower half of the 128bit output
But I need the high half because the fixed point is very high up
That's why it has normal mul.l and not dmulu.l

I had a feeling that I didn't need any of those carries, I had them there when testing signed multiplication
RedCMD#4299 - Discord
Mandelbrot SNKEmini Minesweeper Sudoku
Pages : Précédente1, 2, 3, 4, 5

Planète Casio v42 © créé par Neuronix et Muelsaco 2004 - 2019 | Il y a 171 connectés | Nous contacter | Qui sommes-nous ? | Licences et remerciements

Planète Casio est un site communautaire non affilié à Casio. Toute reproduction de Planète Casio, même partielle, est interdite.
Les programmes et autres publications présentes sur Planète Casio restent la propriété de leurs auteurs et peuvent être soumis à des licences ou copyrights.
CASIO est une marque déposée par CASIO Computer Co., Ltd