CRC32 with "nibble" table

strik · Post by **strik** » Mon Jun 09, 2025 7:27 pm

Hello,

I have a problem where I need a CRC32 on a memory-constraint 6502 system (no 65C02 or other allowed, just plain NMOS-6502). One the one hand, it should be somehow fast (which makes http://6502.org/source/integers/crc.htm not optimal), but also not to take too much space (LUT normally take 256 * 4 = 1024 Byte, for example at http://6502.org/source/integers/crc.htm).

I searched for algorithms that use smaller LUT, but are still faster than doing the bit-banging one by one.

I found that one here from https://create.stephan-brumme.com/crc32/Crc32.cpp:

Code: Select all

/// compute CRC32 (half-byte algoritm)
uint32_t crc32_halfbyte(const void* data, size_t length, uint32_t previousCrc32)
{
  uint32_t crc = ~previousCrc32; // same as previousCrc32 ^ 0xFFFFFFFF
  const uint8_t* current = (const uint8_t*) data;

  /// look-up table for half-byte, same as crc32Lookup[0][16*i]
  static const uint32_t Crc32Lookup16[16] =
  {
    0x00000000,0x1DB71064,0x3B6E20C8,0x26D930AC,0x76DC4190,0x6B6B51F4,0x4DB26158,0x5005713C,
    0xEDB88320,0xF00F9344,0xD6D6A3E8,0xCB61B38C,0x9B64C2B0,0x86D3D2D4,0xA00AE278,0xBDBDF21C
  };

  while (length-- != 0)
  {
    crc = Crc32Lookup16[(crc ^  *current      ) & 0x0F] ^ (crc >> 4);
    crc = Crc32Lookup16[(crc ^ (*current >> 4)) & 0x0F] ^ (crc >> 4);
    current++;
  }

  return ~crc; // same as crc ^ 0xFFFFFFFF
}

androSID/fhw72 from the german Forum-64 showed me an even more optimized version:

Code: Select all

uint32_t calcCRC32(uint32_t crc, const uint8_t* buffer, uint32_t size)
{
   // Nibble table for 0xEDB88320 polynomial
   static const uint32_t crcTable[] =
   {
       0x00000000, 0x1DB71064, 0x3B6E20C8, 0x26D930AC, 0x76DC4190, 0x6B6B51F4, 0x4DB26158, 0x5005713C,
       0xEDB88320, 0xF00F9344, 0xD6D6A3E8, 0xCB61B38C, 0x9B64C2B0, 0x86D3D2D4, 0xA00AE278, 0xBDBDF21C
   };
   
   while (size--)
   {
       crc ^= *buffer++;
       crc = (crc >> 4) ^ crcTable[crc & 0x0F]; // 1st nibble
       crc = (crc >> 4) ^ crcTable[crc & 0x0F]; // 2nd nibble
   }

   return crc;
}

// Full buffer with XOR inout
uint32_t crc = ~calcCRC32(0xFFFFFFFF, buffer, sizeof(buffer));

I tried to implement it in 6502 assembler, and I think I found a not-so-bad solution:

Code: Select all

CRC0   = $FB            ; current value of CRC
;CRC0   = $2000          ; current value of CRC
CRC1   = CRC0 + 1       ; not necessarily contiguous
CRC2   = CRC1 + 1       ; not necessarily contiguous
CRC3   = CRC2 + 1       ; not necessarily contiguous

CRC_START = $A000
CRC_COUNT = $2000

.macro CRC_createTableEntry value,byte,word
    .byte byte(word(value))
.endmacro

.macro CRC_createTable byte,word
    CRC_createTableEntry $00000000, byte, word
    CRC_createTableEntry $1DB71064, byte, word
    CRC_createTableEntry $3B6E20C8, byte, word
    CRC_createTableEntry $26D930AC, byte, word
    CRC_createTableEntry $76DC4190, byte, word
    CRC_createTableEntry $6B6B51F4, byte, word
    CRC_createTableEntry $4DB26158, byte, word
    CRC_createTableEntry $5005713C, byte, word
    CRC_createTableEntry $EDB88320, byte, word
    CRC_createTableEntry $F00F9344, byte, word
    CRC_createTableEntry $D6D6A3E8, byte, word
    CRC_createTableEntry $CB61B38C, byte, word
    CRC_createTableEntry $9B64C2B0, byte, word
    CRC_createTableEntry $86D3D2D4, byte, word
    CRC_createTableEntry $A00AE278, byte, word
    CRC_createTableEntry $BDBDF21C, byte, word
.endmacro

crc32_table0:
    CRC_createTable .lobyte, .loword

crc32_table1:
    CRC_createTable .hibyte, .loword

crc32_table2:
    CRC_createTable .lobyte, .hiword

crc32_table3:
    CRC_createTable .hibyte, .hiword

Run:

        lda #>CRC_START
        ldx #>CRC_COUNT

CalcCRC:
        ; A = Start address high
        ; X = count
        sta readvalue+2
        stx count

        ; init CRC with $FFFF FFFF
        lda #$ff
        sta CRC0
        sta CRC1
        sta CRC2
        sta CRC3

CalcCRC_Loop:

readvalue:
        lda $FF00        ; dummy address for value
        eor CRC0
        sta CRC0

        and #$0f
        tax

        ; shift CRC 4 times to the right: (crc >> 4)
        lda CRC0

        lsr CRC3
        ror CRC2
        ror CRC1
        ror
        lsr CRC3
        ror CRC2
        ror CRC1
        ror
        lsr CRC3
        ror CRC2
        ror CRC1
        ror
        lsr CRC3
        ror CRC2
        ror CRC1
        ror
        sta CRC0

        lda crc32_table0,x
        eor CRC0
        sta CRC0
        lda crc32_table1,x
        eor CRC1
        sta CRC1
        lda crc32_table2,x
        eor CRC2
        sta CRC2
        lda crc32_table3,x
        eor CRC3
        sta CRC3

        lda CRC0
        and #$0f
        tax

        ; shift CRC 4 times to the right: (crc >> 4)
        lda CRC0

        lsr CRC3
        ror CRC2
        ror CRC1
        ror
        lsr CRC3
        ror CRC2
        ror CRC1
        ror
        lsr CRC3
        ror CRC2
        ror CRC1
        ror
        lsr CRC3
        ror CRC2
        ror CRC1
        ror

        sta CRC0


        lda crc32_table0,x
        eor CRC0
        sta CRC0
        lda crc32_table1,x
        eor CRC1
        sta CRC1
        lda crc32_table2,x
        eor CRC2
        sta CRC2
        lda crc32_table3,x
        eor CRC3
        sta CRC3

        ; CalcCRC_Loop is too far away, so check indirectly
        inc readvalue+1
        beq @not_high
        jmp CalcCRC_Loop

@not_high:
        inc readvalue+2
        dec count
        beq @done
        jmp CalcCRC_Loop

@done:
        ; invert the result
        lda CRC0
        eor #$ff
        sta CRC0
        lda CRC1
        eor #$ff
        sta CRC1
        lda CRC2
        eor #$ff
        sta CRC2
        lda CRC3
        eor #$ff
        sta CRC3
        rts

count:
        .byte 0

Note the macro-mockery at startup. This is ca65 syntax. I create 4 tables, crc32_table0, crc32_table1, crc32_table2, and crc32_table3, that can be accessed indexed for the LUT values.

crc32_table0 takes the lowest byte of each LUT entry, crc32_table1 the next significant byte, until crc32_table3, which takes the most significant byte of the LUT entry.

I put the LUT at the beginning, so that I am sure that it does not cross a page bondary.

The result of it is:

Code: Select all

000000r 1               crc32_table0:
000000r 1  00 64 C8 AC      CRC_createTable .lobyte, .loword
000004r 1  90 F4 58 3C  
000008r 1  20 44 E8 8C  
00000Cr 1  B0 D4 78 1C  
000010r 1               
000010r 1               crc32_table1:
000010r 1  00 10 20 30      CRC_createTable .hibyte, .loword
000014r 1  41 51 61 71  
000018r 1  83 93 A3 B3  
00001Cr 1  C2 D2 E2 F2  
000020r 1               
000020r 1               crc32_table2:
000020r 1  00 B7 6E D9      CRC_createTable .lobyte, .hiword
000024r 1  DC 6B B2 05  
000028r 1  B8 0F D6 61  
00002Cr 1  64 D3 0A BD  
000030r 1               
000030r 1               crc32_table3:
000030r 1  00 1D 3B 26      CRC_createTable .hibyte, .hiword
000034r 1  76 6B 4D 50  
000038r 1  ED F0 D6 CB  
00003Cr 1  9B 86 A0 BD

The code when run on the C64 "as is" calculates the CRC32 of the BASIC-ROM 901226-01 as $F833D117. It needs 2159152 cycles, that is, 263 cycles per byte. On a PAL C64, I achieve approx. 3.7 KB/s if I use the ZP address as given for CRC0.

I am currently out of ideas how to optimize it for speed (without adding too much additional space).

Can anybody give any hints?

I thought about using Y for anything. The only thing that seems to work is to use Y for the count memory location. If I try to use Y for a CRC0 or CRC3 to prevent memory accesses, I always have to store it anyway because I have to EOR it with A.

BigDumbDinosaur · Post by **BigDumbDinosaur** » Tue Jun 10, 2025 6:25 am

strik wrote:

I have a problem where I need a CRC32 on a memory-constraint (sic) 6502 system...The code when run on the C64 "as is" calculates the CRC32 of the BASIC-ROM 901226-01 as $F833D117. It needs 2159152 cycles, that is, 263 cycles per byte. On a PAL C64, I achieve approx. 3.7 KB/s if I use the ZP address as given for CRC0.

The performance you are seeing is probably close to the maximum with that algorithm. The C-64’s 6510 is pretty constrained with what it can do with arithmetic- and shift-intensive work, and a PAL C-64’s sub-1 MHz clock isn’t helping any.

In the below code, I made a small revision to your shift routines, which might speed up things a tiny bit.

Code: Select all

CRC0     = $FB                 ; current value of CRC
CRC1     = $FC
CRC2     = $FD
CRC3     = $FE

CRC_START = $A000
CRC_COUNT = $2000

.macro CRC_createTableEntry value,byte,word
    .byte byte(word(value))
.endmacro

.macro CRC_createTable byte,word
    CRC_createTableEntry $00000000, byte, word
    CRC_createTableEntry $1DB71064, byte, word
    CRC_createTableEntry $3B6E20C8, byte, word
    CRC_createTableEntry $26D930AC, byte, word
    CRC_createTableEntry $76DC4190, byte, word
    CRC_createTableEntry $6B6B51F4, byte, word
    CRC_createTableEntry $4DB26158, byte, word
    CRC_createTableEntry $5005713C, byte, word
    CRC_createTableEntry $EDB88320, byte, word
    CRC_createTableEntry $F00F9344, byte, word
    CRC_createTableEntry $D6D6A3E8, byte, word
    CRC_createTableEntry $CB61B38C, byte, word
    CRC_createTableEntry $9B64C2B0, byte, word
    CRC_createTableEntry $86D3D2D4, byte, word
    CRC_createTableEntry $A00AE278, byte, word
    CRC_createTableEntry $BDBDF21C, byte, word
.endmacro

crc32_table0:
    CRC_createTable .lobyte, .loword

crc32_table1:
    CRC_createTable .hibyte, .loword

crc32_table2:
    CRC_createTable .lobyte, .hiword

crc32_table3:
    CRC_createTable .hibyte, .hiword

Run:

        lda #>CRC_START
        ldx #>CRC_COUNT

CalcCRC:
        ; A = Start address high
        ; X = count
        sta readvalue+2
        stx count

        ; init CRC with $FFFF FFFF
        lda #$ff
        sta CRC0
        sta CRC1
        sta CRC2
        sta CRC3

CalcCRC_Loop:

readvalue:
        lda $FF00        ; dummy address for value
        eor CRC0
        sta CRC0
        and #$0f
        tax

        ; shift CRC 4 times to the right: (crc >> 4)
        lda CRC0

        lsr CRC3
        ror CRC2
        ror CRC1
        ror
        lsr CRC3
        ror CRC2
        ror CRC1
        ror
        lsr CRC3
        ror CRC2
        ror CRC1
        ror
        lsr CRC3
        ror CRC2
        ror CRC1
        ror
;
;	Changes...
;
;;;;;;;;   sta CRC0 ; <— *** old ***
;;;;;;;;   lda crc32_table0,x ; <— *** old ***
;;;;;;;;   eor CRC0 ; <— *** old ***
;
        eor crc32_table0,x ; <— *** new ***
;
        sta CRC0
        lda crc32_table1,x
        eor CRC1
        sta CRC1
        lda crc32_table2,x
        eor CRC2
        sta CRC2
        lda crc32_table3,x
        eor CRC3
        sta CRC3
        lda CRC0
        and #$0f
        tax

        ; shift CRC 4 times to the right: (crc >> 4)
        lda CRC0

        lsr CRC3
        ror CRC2
        ror CRC1
        ror
        lsr CRC3
        ror CRC2
        ror CRC1
        ror
        lsr CRC3
        ror CRC2
        ror CRC1
        ror
        lsr CRC3
        ror CRC2
        ror CRC1
        ror
;
;	Changes...
;
;;;;;;;;   sta CRC0 ; <— *** old ***
;;;;;;;;   lda crc32_table0,x ; <— *** old ***
;;;;;;;;   eor CRC0 ; <— *** old ***
;
        eor crc32_table0,x ; <— *** new ***
;
        sta CRC0
        lda crc32_table1,x
        eor CRC1
        sta CRC1
        lda crc32_table2,x
        eor CRC2
        sta CRC2
        lda crc32_table3,x
        eor CRC3
        sta CRC3

        ; CalcCRC_Loop is too far away, so check indirectly

        inc readvalue+1
        beq @not_high
        jmp CalcCRC_Loop

@not_high:
        inc readvalue+2
        dec count
        beq @done
        jmp CalcCRC_Loop

@done:
        ; invert the result
        lda CRC0
        eor #$ff
        sta CRC0
        lda CRC1
        eor #$ff
        sta CRC1
        lda CRC2
        eor #$ff
        sta CRC2
        lda CRC3
        eor #$ff
        sta CRC3
        rts

count:
        .byte 0

You might be able to squeeze a little more out of it by changing the manner in which you are handling your count variable—perhaps move it to page zero at $02.

BB8 · Post by **BB8** » Tue Jun 10, 2025 7:39 pm

On top of what BDD proposed, as you suggested you may use Y for CRC0: each "sta CRC0" and "lda CRC0" could be changed to "TAY" and "TYA"; it should give you 7 cycles per iteration (per byte).
The initial part would be:

Code: Select all

[...]
        ; init CRC with $FFFF FFFF
        ldy #$ff
        ;sty CRC0
        sty CRC1
        sty CRC2
        sty CRC3

CalcCRC_Loop:

        tya
readvalue:
        ;lda $FF00        ; dummy address for value
        ;eor CRC0
        ;sta CRC0
        eor $FF00
        tay

        and #$0f
        tax

        ; shift CRC 4 times to the right: (crc >> 4)
        tya
        ;lda CRC0
[...]

You still need the "sta CRC0" in the @done section.

Moving the table to ZP could give you 8 cycles per iteration, but you need to copy the table to ZP which requires a few hundred cycles (833c).

Code: Select all

		ldx #63
loop: lda tababs,x  ; abs,x
		sta tabzp,x	; zp,x
		dex
		bpl loop

(I don't think there's a chunk of safe 64 bytes in the ZP of a C64: I think you may have to disable interrupts and never go back to Basic)

strik · Post by **strik** » Tue Jun 10, 2025 8:45 pm

Thanks for your suggestions!

With the help of your proposal, @BigDumpDinosaur, the cycles to do a CRC32 of the BASIC ROM drop from 2,159,152 (= 3,738 Byte/s) to 2,059,534 (= 3,918 Byte/s).
@fhw72 optimised it on top of your code by re-arranging the code and using Y for indexed reading of the new value, so it drops even to 2,026,359 cycles (= 3,983 Byte/s).

I will try if I can use @BB8's suggestion - I've tried it before, but failed to get working code. This would partially undo @fhw72 optimizations, though, as he uses Y as index for the loading of the new value (instead of inc readvalue+1, there is only iny left). However, as the CRC0 access (or CRC3, whatever) is much more often than the reading of the new value, this might have a benefit, anyway.

BTW, this code is to be run in a more memory-constrained environment. I just use the C64 for testing because the environment is much easier to handle. In fact, I will avoid the ZP as good as possible, even $FB-$FE will not be used there, so I will be slower than this code here.

strik · Post by **strik** » Tue Jun 10, 2025 9:46 pm

@BB8, Your suggestion is good and even better than the optimizations from @fhw72.
It takes 2001525 cycles and thus, achieves 4032 Byte/s on a PAL C64.

Code: Select all

CRC0     = $FB                 ; current value of CRC
CRC1     = $FC
CRC2     = $FD
CRC3     = $FE

CRC_START = $A000
CRC_COUNT = $2000

.macro CRC_createTableEntry value,byte,word
    .byte byte(word(value))
.endmacro

.macro CRC_createTable byte,word
    CRC_createTableEntry $00000000, byte, word
    CRC_createTableEntry $1DB71064, byte, word
    CRC_createTableEntry $3B6E20C8, byte, word
    CRC_createTableEntry $26D930AC, byte, word
    CRC_createTableEntry $76DC4190, byte, word
    CRC_createTableEntry $6B6B51F4, byte, word
    CRC_createTableEntry $4DB26158, byte, word
    CRC_createTableEntry $5005713C, byte, word
    CRC_createTableEntry $EDB88320, byte, word
    CRC_createTableEntry $F00F9344, byte, word
    CRC_createTableEntry $D6D6A3E8, byte, word
    CRC_createTableEntry $CB61B38C, byte, word
    CRC_createTableEntry $9B64C2B0, byte, word
    CRC_createTableEntry $86D3D2D4, byte, word
    CRC_createTableEntry $A00AE278, byte, word
    CRC_createTableEntry $BDBDF21C, byte, word
.endmacro

crc32_table0:
    CRC_createTable .lobyte, .loword

crc32_table1:
    CRC_createTable .hibyte, .loword

crc32_table2:
    CRC_createTable .lobyte, .hiword

crc32_table3:
    CRC_createTable .hibyte, .hiword

Run:

        lda #>CRC_START
        ldx #>CRC_COUNT

CalcCRC:
        ; A = Start address high
        ; X = count
        sta readvalue+2
        stx count

        ; init CRC with $FFFF FFFF
        ldy #$ff
        sty CRC0     ; can be removed!
        sty CRC1
        sty CRC2
        sty CRC3

CalcCRC_Loop:

        tya
readvalue:
        eor $FF00        ; dummy address for value
        tay
        and #$0f
        tax

        ; shift CRC 4 times to the right: (crc >> 4)
        tya

        lsr CRC3
        ror CRC2
        ror CRC1
        ror
        lsr CRC3
        ror CRC2
        ror CRC1
        ror
        lsr CRC3
        ror CRC2
        ror CRC1
        ror
        lsr CRC3
        ror CRC2
        ror CRC1
        ror

        eor crc32_table0,x ; <— *** new ***

        tay
        lda crc32_table1,x
        eor CRC1
        sta CRC1
        lda crc32_table2,x
        eor CRC2
        sta CRC2
        lda crc32_table3,x
        eor CRC3
        sta CRC3
        tya
        and #$0f
        tax

        ; shift CRC 4 times to the right: (crc >> 4)
        tya

        lsr CRC3
        ror CRC2
        ror CRC1
        ror
        lsr CRC3
        ror CRC2
        ror CRC1
        ror
        lsr CRC3
        ror CRC2
        ror CRC1
        ror
        lsr CRC3
        ror CRC2
        ror CRC1
        ror

        eor crc32_table0,x ; <— *** new ***

        tay
        lda crc32_table1,x
        eor CRC1
        sta CRC1
        lda crc32_table2,x
        eor CRC2
        sta CRC2
        lda crc32_table3,x
        eor CRC3
        sta CRC3

        ; CalcCRC_Loop is too far away, so check indirectly

        inc readvalue+1
        beq @highbyte
        jmp CalcCRC_Loop

@highbyte:
        inc readvalue+2
        dec count

        beq @done
        jmp CalcCRC_Loop

@done:
        ; invert the result
        tya
        eor #$ff
        sta CRC0
        lda CRC1
        eor #$ff
        sta CRC1
        lda CRC2
        eor #$ff
        sta CRC2
        lda CRC3
        eor #$ff
        sta CRC3
        rts

count:
        .byte 0

BB8 · Post by **BB8** » Wed Jun 11, 2025 5:00 am

Well... having a need for speed and not using ZP is kind of a contradiction!

I don't know which machine you're targeting, anyway, if you have a need for speed you could work with only nibbles: you should double the LUT to have only the lower nibble populated (%0000xxxx), so in your code instead of RORs you could directly move to the next byte.
You should split every byte in two different bytes at the beginning, and use 8 bytes in the computation (CRC0 ... CRC7), but instead of RORing you could just pick the previous byte directly.

This would double the LUT in size, but the code should end to have the same length. The speed would increase (I guess) by 3-5 times.

Dr Jefyll · Post by **Dr Jefyll** » Wed Jun 11, 2025 1:53 pm

Yes, I had the same idea, or one that's highly similar. But --verbally, at least -- it's quite challenging to explain! So, this morning when I saw your post, BB8, I opted to make a diagram. Perhaps you can confirm if this is also what you were thinking.

: the nibbles of the CRC are aligned in memory, and the tables match this alignment; CRC aligned.png (3.32 KiB) Viewed 2079 times

To begin, I'll review. Here (above) is a diagram showing the approach used upthread. I'll refer to the "not-so-bad solution" in this post. As shown in the diagram, the CRC is held in a series of bytes in memory. At the top of the loop (label readvalue:), an input byte is fetched. Then in the rest of the loop, most of the work has to do with shifting the CRC right by 4 bits, fetching an entry from each of the four lookup tables and XORing that entry into the corresponding byte of the CRC... then shifting right again and fetching/XORing again (because there are two nibbles in the input byte).

IOW, for each pass through the loop the CRC gets updated twice... and also the CRC gets shifted twice... which requires a heckuva lot of effort! (As we know, the CPU can't do 4-bit shifts; we can only ask it to shift the entire CRC by one bit, then do it again and again and again.)

It's slow, but the 4-bit shifts do achieve the goal of aligning the CRC so it matches the tables. In contrast, my idea involves temporarily (mis-)aligning the tables to match the CRC! To do that, we need an additional set of tables (as shown below).

: Tables for updating a CRC that's "not in the right place" (ie, isn't shifted yet).
Note the use of color. In the far left table the upper nibbles of all the bytes are zero; likewise, in the far right table the lower nibbles of all the bytes are zero. Thus when one of these table entries gets XORed into a byte of the CRC, one nibble of the CRC byte will be unaffected.; CRC mis-aligned.png (3.38 KiB) Viewed 2079 times

With my approach, we still do the XOR business twice for each pass through the loop, but the first set of four 1-bit shifts doesn't occur. Instead, we postpone that shift and simply work on a CRC that's "not in the right place" (ie, isn't shifted yet). For this we use the alternate set of tables shown above, and we proceed with the first XOR step.

Now we're due to shift another four bits... in addition to the four bits we already postponed. IOW, now we can shift by eight bits and things will come out right! Then we are "up to date" -- all shifts done, nothing postponed. This means the second set of XOR steps can use the "normal" set of tables shown in my first diagram (rather than the mis-aligned tables in the second diagram). So, in summary, every pass through the loop does a mis-aligned XOR step, a byte-shift (in effect, two nibble-shifts at once), then an aligned XOR step.

Doing one eight-bit shift of the CRC -- IOW a byte-move -- is immensely faster than doing two sets of four 1-bit shifts, so the additional complexity pays off very well indeed.

I haven't tried coding any of this, so perhaps I'm at risk for a big head-slap. But, all being well, there may even be potential for an additional speedup as follows. Each XOR step requires some LDA's from and some STA's to the CRC. So, if the STA's were to occur to an off-by-one address, then the LDA/STA's for the first bunch or XORs could also accomplish the byte-shift at no extra cost.

-- Jeff

[Edits] [More edits!]

BB8 · Post by **BB8** » Wed Jun 11, 2025 4:33 pm

@Jeff: I meant something different. With your approach you are still doing a set of 1-bit shifts, while I want to eliminate that too. And you'd need 4 tables for nibbles in low and high positions.

My idea would be to have everything in the low nibble.
There should be 2 tables: one with the low nibbles of the current table, the other with the high nibbles but moved in the low position. So each table would have values in the form of %0000xxxx.

Then the nibbles of CRC0...CRC3 would be split in 8 new variables each containing half the byte of current variables:
newCRC0 = (hi(CRC0) >> 4)
newCRC1 = (lo(CRC0))
newCRC2 = (hi(CRC1) >> 4)
newCRC3 = (lo(CRC1))
newCRC4 = (hi(CRC2) >> 4)
newCRC5 = (lo(CRC2))
newCRC6 = (hi(CRC3) >> 4)
newCRC7 = (lo(CRC3))
(the nibbles will always be in the "low" position)

So the mechanic of the algo stays almost the same, but instead of 4x 1-bit shifts, you can just copy a byte (nibble) to the next position (i.e. lda newCRC1 / EOR table,x / sta newCRC2: voilà, a EOR plus nibble shift). Probably the EORs will have to be done in reverse order (newCRC7 ---> ... ---> newCRC0)
Then at the end the nibbles must be merged, but this will happen only once.

This requires 8 CRCx variables (instead of 4) and to double the table.

fhw72 · Post by **fhw72** » Wed Jun 11, 2025 7:52 pm

@BB8:

I took your idea and did it on C code...

Code: Select all

uint32_t calcCRC32_Nib(uint8_t crcReg[8], const uint8_t* buffer, uint32_t size)
{
   static const uint32_t crcTable7[] = {0x0, 0x1, 0x3, 0x2, 0x7, 0x6, 0x4, 0x5, 0xE, 0xF, 0xD, 0xC, 0x9, 0x8, 0xA, 0xB};
   static const uint32_t crcTable6[] = {0x0, 0xD, 0xB, 0x6, 0x6, 0xB, 0xD, 0x0, 0xD, 0x0, 0x6, 0xB, 0xB, 0x6, 0x0, 0xD};
   static const uint32_t crcTable5[] = {0x0, 0xB, 0x6, 0xD, 0xD, 0x6, 0xB, 0x0, 0xB, 0x0, 0xD, 0x6, 0x6, 0xD, 0x0, 0xB};
   static const uint32_t crcTable4[] = {0x0, 0x7, 0xE, 0x9, 0xC, 0xB, 0x2, 0x5, 0x8, 0xF, 0x6, 0x1, 0x4, 0x3, 0xA, 0xD};
   static const uint32_t crcTable3[] = {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF};
   static const uint32_t crcTable2[] = {0x0, 0x0, 0x0, 0x0, 0x1, 0x1, 0x1, 0x1, 0x3, 0x3, 0x3, 0x3, 0x2, 0x2, 0x2, 0x2};
   static const uint32_t crcTable1[] = {0x0, 0x6, 0xC, 0xA, 0x9, 0xF, 0x5, 0x3, 0x2, 0x4, 0xE, 0x8, 0xB, 0xD, 0x7, 0x1};
   static const uint32_t crcTable0[] = {0x0, 0x4, 0x8, 0xC, 0x0, 0x4, 0x8, 0xC, 0x0, 0x4, 0x8, 0xC, 0x0, 0x4, 0x8, 0xC};

   uint8_t y = crcReg[0];
   while (size--)
   {
       uint8_t x;
#if 1
       uint8_t a = y;      // TYA
       a ^= *buffer++;     // EOR $buffer (increment missing still)
       x = a;              // TAX
       a &= 0x0F;          // AND #$0F
       y = a;              // TAY
       a = x;              // TXA
       a >>= 4;            // 4* LSR
       a ^= crcReg[1];     // EOR crcReg[1]
       crcReg[1] = a;      // STA crcReg[1]
#else
       crcReg[1] ^= *buffer >> 4;
       y ^= *buffer++ & 0x0F;
#endif

       x = y;
       y = crcReg[1] ^ crcTable0[x];
       crcReg[1] = crcReg[2] ^ crcTable1[x];
       crcReg[2] = crcReg[3] ^ crcTable2[x];
       crcReg[3] = crcReg[4] ^ crcTable3[x];
       crcReg[4] = crcReg[5] ^ crcTable4[x];
       crcReg[5] = crcReg[6] ^ crcTable5[x];
       crcReg[6] = crcReg[7] ^ crcTable6[x];
       crcReg[7] = crcTable7[x];
       x = y;
       y = crcReg[1] ^ crcTable0[x];
       crcReg[1] = crcReg[2] ^ crcTable1[x];
       crcReg[2] = crcReg[3] ^ crcTable2[x];
       crcReg[3] = crcReg[4] ^ crcTable3[x];
       crcReg[4] = crcReg[5] ^ crcTable4[x];
       crcReg[5] = crcReg[6] ^ crcTable5[x];
       crcReg[6] = crcReg[7] ^ crcTable6[x];
       crcReg[7] = crcTable7[x];
   }
   crcReg[0] = y;
   return ~((crcReg[7] << 28) | (crcReg[6] << 24) | (crcReg[5] << 20) | (crcReg[4] << 16) |
            (crcReg[3] << 12) | (crcReg[2] << 8) | (crcReg[1] << 4) | (crcReg[0] << 0));
}

Code needs to be called like this:

uint8_t crcReg[8] = {0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF};
uint32_t crc = calcCRC32_Nib(crcReg, buffer, sizeof(buffer) );

More ideas?

Dr Jefyll · Post by **Dr Jefyll** » Thu Jun 12, 2025 4:15 am

BB8, what you're proposing is a pleasant surprise, similar to yet different from what I had in mind!

Compared with the previous approaches upthread, I think yours and mine both promise a compelling speed increase, although compared to previous approaches they both require approximately twice as much memory for the lookup tables. (Your lookup tables consume an entire byte for each nibble that's stored. That's partially true for my tables, too, but the main issue is that mine store the same nibbles in two different places). Is this extra memory consumption justified? It'll depend on strik's priorities; ie, to what extent is compactness valued, compared with the value placed on speed.

I'd have to say your approach more elegant than mine, BB8, so hats off to you for that! But is it faster? I'm not convinced... at least not yet. I like how...

BB8 wrote:

lda newCRC1 / EOR table,x / sta newCRC2: voilà, a EOR plus nibble shift

... but isn't it true that this LDA/EOR/STA combo has to happen sixteen times for each pass through the loop? I anticipate that my version can do the same with only nine LDA/EOR/STA combos (and as I mentioned in my previous post, I suspect a free byte-shift can be wangled in remotely the same way you're getting the free "plus nibble shift" just quoted).

It would be cool to see both of these approaches implemented in assembly language! strik, do you think you'd be up for doing that? Are there details that still need explaining? Dunno 'bout BB8, but for me it might be a while before I could get around to doing the actual coding.

-- Jeff

GARTHWILSON · Post by **GARTHWILSON** » Thu Jun 12, 2025 4:33 am

I haven't been following this closely; but if it would help, remember there's David Galloway's efficient, 8-byte, 12-cycle nybble swap, at http://6502.org/source/general/SWN.html .

BB8 · Post by **BB8** » Thu Jun 12, 2025 5:25 am

@Jeff: I'll have to review more closely what you're proposing, I haven't analyzed it deeply; I'll find some time later or tomorrow. It appears to me that you'll need to have 4 tables:
- one with current low nibbles in lsb position
- one with current low nibbles in msb position
- one with current high nibbles in lsb position
- one with current high nibbles in msb position
Otherwise (if you compact those in just two tables) I think you'll have to mask out the unrequired nibble.
Possibly those are too many tables for the OP.

About what I was proposing, I haven't written anything yet; you're right, 16 loads would be necessary; maybe there's a way to avoid loading each value twice, and do two consecutive EORs.

BB8 · Post by **BB8** » Thu Jun 12, 2025 2:32 pm

if I didn't make mistakes...:

Edit: this table is in fact wrong; I posted a corrected one later in the thread

Dr Jefyll · Post by **Dr Jefyll** » Thu Jun 12, 2025 3:35 pm

Whoops! Sorry, BB8 -- minutes ago I posted some much-needed edits to my post here. I hope this clarifies my proposal. I do find this a challenging topic to explain!

Until now I didn't notice your latest post (which I have not yet examined closely).

-- Jeff

ps- Re my proposal, you remarked,"I think you'll have to mask out the unrequired nibble." Sorry this wasn't properly explained at first. Pls see changes I made to the File Comment on the 2nd image.

fhw72 · Post by **fhw72** » Thu Jun 12, 2025 4:28 pm

I'm away until Saturday but I'll try to realize @Jeff proposal.
Nice challenge

CRC32 with "nibble" table

CRC32 with "nibble" table

Re: CRC32 with "nibble" table

Re: CRC32 with "nibble" table

Re: CRC32 with "nibble" table

Re: CRC32 with "nibble" table

Re: CRC32 with "nibble" table

Re: CRC32 with "nibble" table

Re: CRC32 with "nibble" table

Re: CRC32 with "nibble" table

Re: CRC32 with "nibble" table

Re: CRC32 with "nibble" table

Re: CRC32 with "nibble" table

Re: CRC32 with "nibble" table

Re: CRC32 with "nibble" table

Re: CRC32 with "nibble" table