diff --git a/src/libc/memcpy.src b/src/libc/memcpy.src index f6efc4642..65a9dcffd 100644 --- a/src/libc/memcpy.src +++ b/src/libc/memcpy.src @@ -12,17 +12,30 @@ .else _memcpy: - ld iy, -1 + ; size > 0 : 25F + 15R + 1 + LDIR + ; size >= 65536 : 32F + 16R + 3 + LDIR + ; size == 0 : 26F + 13R + 2 + ; size >= 65536 + 7F + 1R + 2 (only when the low 16 bits are zero) + + ld iy, 0 add iy, sp - ld bc, (iy + 10) ; Load count - sbc hl, hl - add hl, bc - jr nc, .L.zero - ld de, (iy + 4) ; Load destination - ld hl, (iy + 7) ; Load source + ld bc, (iy + 9) ; Load count + ld a, c + or a, b + ld de, (iy + 3) ; Load destination + jr z, .L.maybe_zero +.L.not_zero: + ld hl, (iy + 6) ; Load source ldir -.L.zero: - ld hl, (iy + 4) ; Return the destination pointer + ld hl, (iy + 3) ; Return the destination pointer + ret + +.L.maybe_zero: + ; low 16 bits are zero + or a, (iy + 11) ; test upper 8 bits + jr nz, .L.not_zero ; size >= 65536 + ; size == 0 + ex de, hl ret .endif diff --git a/src/libc/memmove.src b/src/libc/memmove.src index 74a465c8f..2077830b0 100644 --- a/src/libc/memmove.src +++ b/src/libc/memmove.src @@ -15,87 +15,103 @@ ; Optimized for when src != dst _memmove: - ; src > dst | LDIR | 32F + 15R + 1 - ; src < dst | LDDR | 35F + 12R + 2 - ; src = dst | LDDR | 35F + 12R + 2 - ; zero size | | 24F + 12R + 2 + ; src > dst | LDIR | 31F + 15R + 2 + ; src < dst | LDDR | 33F + 12R + 1 + ; src = dst | LDIR | 31F + 15R + 2 + ; zero size | | 26F + 10R + 2 + ; size >= 65536 + 7F + 1R + 2 (only when the low 16 bits are zero) - ld iy, -1 + ld iy, 0 add iy, sp - ld bc, (iy + 10) - sbc hl, hl - add hl, bc - jr nc, .L.zero - ld hl, (iy + 7) - ld de, (iy + 4) + ld bc, (iy + 9) + ld a, c + or a, b + ld de, (iy + 3) + jr z, .L.maybe_zero +.L.not_zero: + ld hl, (iy + 6) sbc hl, de - ; src <= dst - jr c, .L.copy_backwards - ; src > dst -; .copy_forwards: - add hl, de - inc hl - ldir -.L.zero: - ld hl, (iy + 4) - ret - -.L.copy_backwards: + ; src >= dst + jr nc, .L.copy_forwards + ; src < dst +; .L.copy_backwards: ; move HL and DE to the end + dec de ; DE = dst - 1 + ex de, hl + add hl, bc ; HL = dst + size - 1, DE = src - dst ex de, hl - add hl, bc - ex de, hl ; HL = src - dst - 1, DE = dst + size - add hl, de ; HL = src + size - 1 - dec de ; DE = dst + size - 1 + add hl, de ; HL = src + size - 1, DE = dst + size - 1 lddr ex de, hl inc hl ret +.L.copy_forwards: + add hl, de + ldir + ld hl, (iy + 3) + ret + +.L.maybe_zero: + ; low 16 bits are zero + or a, (iy + 11) ; test upper 8 bits + jr nz, .L.not_zero ; size >= 65536 + ; size == 0 + ex de, hl + ret + .else ; Optimized for when src == dst _memmove: - ; src > dst | LDIR | 33F + 15R + 2 - ; src < dst | LDDR | 36F + 12R + 2 - ; src = dst | | 29F + 12R + 2 - ; zero size | | 24F + 12R + 2 + ; src > dst | LDIR | 31F + 15R + 2 + ; src < dst | LDDR | 34F + 12R + 2 + ; src = dst | | 27F + 12R + 2 + ; zero size | | 26F + 10R + 2 + ; size >= 65536 + 7F + 1R + 2 (only when the low 16 bits are zero) - ld iy, -1 + ld iy, 0 add iy, sp - ld bc, (iy + 10) - sbc hl, hl - add hl, bc - jr nc, .L.zero - ld de, (iy + 4) - ld hl, (iy + 7) - or a, a + ld bc, (iy + 9) + ld a, c + or a, b + ld de, (iy + 3) + jr z, .L.maybe_zero +.L.not_zero: + ld hl, (iy + 6) sbc hl, de ; src < dst - jr c, .copy_backwards + jr c, .L.copy_backwards ; src >= dst ; .L.copy_forwards: add hl, de ; src == dst - ret z ; skips LDIR when src == dst + ret z ; skips LDIR when src == dst ; src > dst ldir -.L.zero: - ld hl, (iy + 4) + ld hl, (iy + 3) ret .L.copy_backwards: ; move HL and DE to the end - dec de ; DE = dst - 1 + dec de ; DE = dst - 1 ex de, hl - add hl, bc ; HL = dst + size - 1, DE = src - dst + add hl, bc ; HL = dst + size - 1, DE = src - dst ex de, hl - add hl, de ; HL = src + size - 1, DE = dst + size - 1 + add hl, de ; HL = src + size - 1, DE = dst + size - 1 lddr ex de, hl inc hl ret +.L.maybe_zero: + ; low 16 bits are zero + or a, (iy + 11) ; test upper 8 bits + jr nz, .L.not_zero ; size >= 65536 + ; size == 0 + ex de, hl + ret + .endif .endif diff --git a/src/libc/mempcpy.src b/src/libc/mempcpy.src index 4e0a1c96e..7415fa75b 100644 --- a/src/libc/mempcpy.src +++ b/src/libc/mempcpy.src @@ -5,26 +5,6 @@ .global _mempcpy .type _mempcpy, @function -.if 0 - -; faster when count is zero -_mempcpy: - ld iy, -1 - add iy, sp - ld bc, (iy + 10) ; Load count - sbc hl, hl - add hl, bc - ld hl, (iy + 4) ; Load destination - ret nc ; zero bytes to copy - ld de, (iy + 7) ; Load source - ex de, hl - ldir - ex de, hl - ret - -.else - -; faster in full execution case by 0F + 1 clock cycles _mempcpy: ld iy, -1 add iy, sp @@ -38,5 +18,3 @@ _mempcpy: .L.zero_byte_copy: ex de, hl ret - -.endif diff --git a/test/standalone/asprintf_fprintf/src/fill_mem32.s b/test/standalone/asprintf_fprintf/src/fill_mem32.s new file mode 100644 index 000000000..a53442d10 --- /dev/null +++ b/test/standalone/asprintf_fprintf/src/fill_mem32.s @@ -0,0 +1,26 @@ + .assume adl = 1 + + .section .text + + .global _fill_mem32 + +; void fill_mem32(void *dst, size_t bytes, uint32_t pattern) +_fill_mem32: + ld iy, 0 + add iy, sp + ld de, (iy + 3) + ld hl, (iy + 6) + ld bc, 4 + sbc hl, bc + ; return if bytes <= pattern_size + ret c + ret z + push hl + ; copy pattern once + lea hl, iy + 9 + ldir + pop bc + ; now copy (bytes - pattern_size) + ld hl, (iy + 3) + ldir + ret diff --git a/test/standalone/asprintf_fprintf/src/main.c b/test/standalone/asprintf_fprintf/src/main.c index 181d5f8f7..483fc3e00 100644 --- a/test/standalone/asprintf_fprintf/src/main.c +++ b/test/standalone/asprintf_fprintf/src/main.c @@ -49,6 +49,9 @@ void *T_memccpy(void *__restrict dest, const void *__restrict src, int c, size_t void *T_mempcpy(void *__restrict dest, const void *__restrict src, size_t n) __attribute__((nonnull(1, 2))); +void *T_memchr(const void *s, int c, size_t n) + __attribute__((nonnull(1))); + void *T_memrchr(const void *s, int c, size_t n) __attribute__((nonnull(1))); @@ -92,6 +95,7 @@ void T_bzero(void* s, size_t n); #define T_memcmp memcmp #define T_memccpy memccpy #define T_mempcpy mempcpy +#define T_memchr memchr #define T_memrchr memrchr #define T_memmem memmem #define T_memrmem memrmem @@ -994,6 +998,54 @@ int strchrnul_test(void) { return 0; } +int mem65536_test(void) { + void fill_mem32(void *dst, size_t bytes, uint32_t pattern); + + uint8_t * const dst = (uint8_t*)0xD40000; + const size_t screen_size = 320 * 240 * 2; + memset(dst, 0, screen_size); + const size_t B16 = 65536; + const size_t B17 = 131072; + + /* test return values */ + + C(T_memcpy(SINK, SINK, B16) == SINK); + C(T_memcpy(SINK, SINK, B17) == SINK); + + C(T_memmove(SINK, SINK, B16) == SINK); + C(T_memmove(SINK, SINK, B17) == SINK); + + C(T_memmove(SINK + 16, SINK, B16) == SINK + 16); + C(T_memmove(SINK + 16, SINK, B17) == SINK + 16); + + C(T_memmove(SINK, SINK + 16, B16) == SINK); + C(T_memmove(SINK, SINK + 16, B17) == SINK); + + /* test memcpy and memmove when size is a non-zero multiple of 65536 */ + + fill_mem32(dst + screen_size - B16, B16, 0x78563412); + C(T_memcpy(dst + 32, dst + screen_size - B16, B16) == dst + 32); + C(T_memchr(dst, 0x00, 32) == dst); + C(T_memchr(dst, 0x12, 32) == NULL_ptr); + C(T_memchr(dst, 0x12, 33) == dst + 32); + C(T_memrchr(dst, 0x78, 32 + B16 + 32) == dst + 32 + B16 - 1); + const uint32_t pattern_1 = 0xA3A0A1A0; + const uint32_t pattern_2 = 0xFECDAB89; + fill_mem32(dst, 32, pattern_1); + fill_mem32(dst + 24576, B16, pattern_2); + + C(T_memmove(dst + 61, dst, B16) == dst + 61); + C(T_memmem(dst, B17, &pattern_1, sizeof(pattern_1)) == dst); + C(T_memrmem(dst, B17, &pattern_1, sizeof(pattern_1)) == dst + 61 - 4 + 32); + C(T_memmove(dst + 24578, dst, B16) == dst + 24578); + C(T_memmem(dst, B16, &pattern_1, sizeof(pattern_1)) == dst + 0); + C(T_memrmem(dst, B16, &pattern_1, sizeof(pattern_1)) == dst + 24578 + 61 + 32 - 4); + C(T_memmem(dst, B16, &pattern_2, sizeof(pattern_2)) == dst + 24576 + 24578 + 61); + C(T_memrmem(dst, B16, &pattern_2, sizeof(pattern_2)) == dst + B16 - 4u - (((24578u - 24576u) - 61u) % 4u)); + + return 0; +} + int run_tests(void) { int ret = 0; /* boot_asprintf */ @@ -1027,6 +1079,9 @@ int run_tests(void) { TEST(strrstr_test()); TEST(strchrnul_test()); + TEST(mem65536_test()); + os_ClrHome(); + return 0; } diff --git a/test/standalone/asprintf_fprintf/src/rename.s b/test/standalone/asprintf_fprintf/src/rename.s index aa23678af..e48903467 100644 --- a/test/standalone/asprintf_fprintf/src/rename.s +++ b/test/standalone/asprintf_fprintf/src/rename.s @@ -2,7 +2,7 @@ .section .text - .global _T_memset, _T_memcpy, _T_memmove, _T_memcmp, _T_memccpy, _T_mempcpy, _T_memrchr, _T_memmem, _T_memrmem + .global _T_memset, _T_memcpy, _T_memmove, _T_memcmp, _T_memccpy, _T_mempcpy, _T_memchr, _T_memrchr, _T_memmem, _T_memrmem .global _T_strlen, _T_strcmp, _T_strncmp, _T_stpcpy, _T_stpncpy, _T_strlcat, _T_strchrnul, _T_strrstr .global _T_bzero @@ -18,6 +18,8 @@ _T_memccpy: jp _memccpy _T_mempcpy: jp _mempcpy +_T_memchr: + jp _memchr _T_memrchr: jp _memrchr _T_memmem: @@ -51,6 +53,6 @@ _T_bzero: _NULL_ptr: db $00, $00, $00 - .extern _memset, _memcpy, _memmove, _memcmp, _memccpy, _mempcpy, _memrchr, _memmem, _memrmem + .extern _memset, _memcpy, _memmove, _memcmp, _memccpy, _mempcpy, _memchr, _memrchr, _memmem, _memrmem .extern _strlen, _strcmp, _strncmp, _stpcpy, _stpncpy, _strlcat, _strchrnul, _strrstr .extern _bzero