I have a problem where I need a CRC32 on a memory-constraint 6502 system (no 65C02 or other allowed, just plain NMOS-6502). One the one hand, it should be somehow fast (which makes http://6502.org/source/integers/crc.htm not optimal), but also not to take too much space (LUT normally take 256 * 4 = 1024 Byte, for example at http://6502.org/source/integers/crc.htm).
I searched for algorithms that use smaller LUT, but are still faster than doing the bit-banging one by one.
I found that one here from https://create.stephan-brumme.com/crc32/Crc32.cpp:
Code: Select all
/// compute CRC32 (half-byte algoritm)
uint32_t crc32_halfbyte(const void* data, size_t length, uint32_t previousCrc32)
{
uint32_t crc = ~previousCrc32; // same as previousCrc32 ^ 0xFFFFFFFF
const uint8_t* current = (const uint8_t*) data;
/// look-up table for half-byte, same as crc32Lookup[0][16*i]
static const uint32_t Crc32Lookup16[16] =
{
0x00000000,0x1DB71064,0x3B6E20C8,0x26D930AC,0x76DC4190,0x6B6B51F4,0x4DB26158,0x5005713C,
0xEDB88320,0xF00F9344,0xD6D6A3E8,0xCB61B38C,0x9B64C2B0,0x86D3D2D4,0xA00AE278,0xBDBDF21C
};
while (length-- != 0)
{
crc = Crc32Lookup16[(crc ^ *current ) & 0x0F] ^ (crc >> 4);
crc = Crc32Lookup16[(crc ^ (*current >> 4)) & 0x0F] ^ (crc >> 4);
current++;
}
return ~crc; // same as crc ^ 0xFFFFFFFF
}
Code: Select all
uint32_t calcCRC32(uint32_t crc, const uint8_t* buffer, uint32_t size)
{
// Nibble table for 0xEDB88320 polynomial
static const uint32_t crcTable[] =
{
0x00000000, 0x1DB71064, 0x3B6E20C8, 0x26D930AC, 0x76DC4190, 0x6B6B51F4, 0x4DB26158, 0x5005713C,
0xEDB88320, 0xF00F9344, 0xD6D6A3E8, 0xCB61B38C, 0x9B64C2B0, 0x86D3D2D4, 0xA00AE278, 0xBDBDF21C
};
while (size--)
{
crc ^= *buffer++;
crc = (crc >> 4) ^ crcTable[crc & 0x0F]; // 1st nibble
crc = (crc >> 4) ^ crcTable[crc & 0x0F]; // 2nd nibble
}
return crc;
}
// Full buffer with XOR inout
uint32_t crc = ~calcCRC32(0xFFFFFFFF, buffer, sizeof(buffer));
Code: Select all
CRC0 = $FB ; current value of CRC
;CRC0 = $2000 ; current value of CRC
CRC1 = CRC0 + 1 ; not necessarily contiguous
CRC2 = CRC1 + 1 ; not necessarily contiguous
CRC3 = CRC2 + 1 ; not necessarily contiguous
CRC_START = $A000
CRC_COUNT = $2000
.macro CRC_createTableEntry value,byte,word
.byte byte(word(value))
.endmacro
.macro CRC_createTable byte,word
CRC_createTableEntry $00000000, byte, word
CRC_createTableEntry $1DB71064, byte, word
CRC_createTableEntry $3B6E20C8, byte, word
CRC_createTableEntry $26D930AC, byte, word
CRC_createTableEntry $76DC4190, byte, word
CRC_createTableEntry $6B6B51F4, byte, word
CRC_createTableEntry $4DB26158, byte, word
CRC_createTableEntry $5005713C, byte, word
CRC_createTableEntry $EDB88320, byte, word
CRC_createTableEntry $F00F9344, byte, word
CRC_createTableEntry $D6D6A3E8, byte, word
CRC_createTableEntry $CB61B38C, byte, word
CRC_createTableEntry $9B64C2B0, byte, word
CRC_createTableEntry $86D3D2D4, byte, word
CRC_createTableEntry $A00AE278, byte, word
CRC_createTableEntry $BDBDF21C, byte, word
.endmacro
crc32_table0:
CRC_createTable .lobyte, .loword
crc32_table1:
CRC_createTable .hibyte, .loword
crc32_table2:
CRC_createTable .lobyte, .hiword
crc32_table3:
CRC_createTable .hibyte, .hiword
Run:
lda #>CRC_START
ldx #>CRC_COUNT
CalcCRC:
; A = Start address high
; X = count
sta readvalue+2
stx count
; init CRC with $FFFF FFFF
lda #$ff
sta CRC0
sta CRC1
sta CRC2
sta CRC3
CalcCRC_Loop:
readvalue:
lda $FF00 ; dummy address for value
eor CRC0
sta CRC0
and #$0f
tax
; shift CRC 4 times to the right: (crc >> 4)
lda CRC0
lsr CRC3
ror CRC2
ror CRC1
ror
lsr CRC3
ror CRC2
ror CRC1
ror
lsr CRC3
ror CRC2
ror CRC1
ror
lsr CRC3
ror CRC2
ror CRC1
ror
sta CRC0
lda crc32_table0,x
eor CRC0
sta CRC0
lda crc32_table1,x
eor CRC1
sta CRC1
lda crc32_table2,x
eor CRC2
sta CRC2
lda crc32_table3,x
eor CRC3
sta CRC3
lda CRC0
and #$0f
tax
; shift CRC 4 times to the right: (crc >> 4)
lda CRC0
lsr CRC3
ror CRC2
ror CRC1
ror
lsr CRC3
ror CRC2
ror CRC1
ror
lsr CRC3
ror CRC2
ror CRC1
ror
lsr CRC3
ror CRC2
ror CRC1
ror
sta CRC0
lda crc32_table0,x
eor CRC0
sta CRC0
lda crc32_table1,x
eor CRC1
sta CRC1
lda crc32_table2,x
eor CRC2
sta CRC2
lda crc32_table3,x
eor CRC3
sta CRC3
; CalcCRC_Loop is too far away, so check indirectly
inc readvalue+1
beq @not_high
jmp CalcCRC_Loop
@not_high:
inc readvalue+2
dec count
beq @done
jmp CalcCRC_Loop
@done:
; invert the result
lda CRC0
eor #$ff
sta CRC0
lda CRC1
eor #$ff
sta CRC1
lda CRC2
eor #$ff
sta CRC2
lda CRC3
eor #$ff
sta CRC3
rts
count:
.byte 0
crc32_table0 takes the lowest byte of each LUT entry, crc32_table1 the next significant byte, until crc32_table3, which takes the most significant byte of the LUT entry.
I put the LUT at the beginning, so that I am sure that it does not cross a page bondary.
The result of it is:
Code: Select all
000000r 1 crc32_table0:
000000r 1 00 64 C8 AC CRC_createTable .lobyte, .loword
000004r 1 90 F4 58 3C
000008r 1 20 44 E8 8C
00000Cr 1 B0 D4 78 1C
000010r 1
000010r 1 crc32_table1:
000010r 1 00 10 20 30 CRC_createTable .hibyte, .loword
000014r 1 41 51 61 71
000018r 1 83 93 A3 B3
00001Cr 1 C2 D2 E2 F2
000020r 1
000020r 1 crc32_table2:
000020r 1 00 B7 6E D9 CRC_createTable .lobyte, .hiword
000024r 1 DC 6B B2 05
000028r 1 B8 0F D6 61
00002Cr 1 64 D3 0A BD
000030r 1
000030r 1 crc32_table3:
000030r 1 00 1D 3B 26 CRC_createTable .hibyte, .hiword
000034r 1 76 6B 4D 50
000038r 1 ED F0 D6 CB
00003Cr 1 9B 86 A0 BD
I am currently out of ideas how to optimize it for speed (without adding too much additional space).
Can anybody give any hints?
I thought about using Y for anything. The only thing that seems to work is to use Y for the count memory location. If I try to use Y for a CRC0 or CRC3 to prevent memory accesses, I always have to store it anyway because I have to EOR it with A.