; Gimp/plug-ins/script-fu/test/tests/TS/string-escape.scm

; test string escape sequences

; An "escape sequence" is a sequence of characters that,
; when parsing a string, yields a single character.
; All escape sequences start with the backslash.

; TS is unicode: lengths are in unichars, not bytes

; 0xff is the C language notation for a hex constant

; Many tests are lax: they check only string-length, not the string's content


; We can't test certain errors since they terminate the interpreter
;  - Doublequote without trailing doublequote
;  - buffer overflow
;  - short hex escapes (<2 hex digits)


(test! "escaped doublequote")
; \" yields a single doublequote character, codepoint 34
(assert `(= (string-length "\"")   1))
(assert `(= (char->integer (string-ref "\"" 0))   34))

(test! "escaped newline, tab, carriage return")
; Each escape yields exactly one control character.
; Check the codepoint too: string-length alone would also pass
; if the escape yielded the wrong single character.
(assert `(= (string-length "\n")   1))
(assert `(= (char->integer (string-ref "\n" 0))   10))
(assert `(= (string-length "\t")   1))
(assert `(= (char->integer (string-ref "\t" 0))   9))
(assert `(= (string-length "\r")   1))
(assert `(= (char->integer (string-ref "\r" 0))   13))

(test! "escaped backslash")
; escaped backslash, stands for itself: one backslash character, codepoint 92
(assert `(= (string-length "\\")   1))
(assert `(= (char->integer (string-ref "\\" 0))   92))

(test! "escaped other chars, ASCII")
; any other escaped char, that is not an octal digit, stands for itself
(assert `(= (string-length "\a")   1))
; check the content too: the escaped char must equal the unescaped char
(assert `(string=? "\a"  "a"))

(test! "escaped other chars, unichar")
; a non-ASCII escaped char also stands for itself
(assert `(= (string-length "\λ")   1))
(assert `(string=? "\λ"  "λ"))


(test! "names of sharp char constants are not string escapes")
; !!! The readable names used in sharp constants for control chars,
; e.g. #\tab, are not escape sequences in strings.
; Inside a string, \tab is the escape \t followed by the chars a and b.
(assert `(= (string-length "\tab")     3))
; check the content too: a tab, then the literal chars "ab"
(assert `(string=? "\tab"  (string-append "\t" "ab")))


; octal escape sequences
; FUTURE obsolete these: we don't need to support both hex and octal.

(test! "octal escapes")

(test! "octal NUL")
; one digit octal sequence
; NUL character, a zero byte, yields a string, but empty
(assert `(= (string-length "\0")   0))

; Labelled separately so a failure is not reported under "octal NUL"
(test! "two digit octal escape")
; two digit octal sequence
; 0o11 is tab
(assert `(string=? "\11"  "\t"))

(test! "octal escaped characters match non-escaped ASCII characters")
; Three digit octal sequence: 0o101 is decimal 65, the codepoint of A.
(assert '(string=? "\101" "A"))


; Three digit octal sequences that don't fit in a byte.
; Comments in the code say such a sequence should yield an error.
; So values <= 255, which is 0o377, should work.

(test! "octal 377")
; The result is a sequence of bytes that is not properly UTF-8 encoded,
; so the string reports a length of 0.
(assert '(= (string-length "\377") 0))


; (test! "octal 400 yields error")
; In v2 the max value is 255, which fits in a byte.
; FIXME: the code comments say 0o400 should yield an error
;(assert-error `(string-length "\400")
;                "Error: Error reading string")

; !!! But in UTF-8 the codepoint 0o377==255 is encoded in two bytes
; and yields LATIN SMALL LETTER Y WITH DIAERESIS
; !!! length in chars is 1, length in bytes is 2.
; FUTURE (assert `(= (string-length "\377")   1))

; FUTURE: if we don't obsolete octal escapes altogether,
; then three or four octal digits should be allowed.
;(test! "octal 777")
; 0o777 is 0x1ff
; 1 char, encoded as 2 bytes.
; (assert `(= (string-length "\777")   1))
; TODO test the string is two-bytes
; we don't have string-length-bytes function

; four octal digits yield two chars and three bytes.
; (assert `(= (string-length "\3777")   2))
; TODO test the second char is '7'


(test! "hex escapes")

(test! "hex NUL")
; The escape yields the NUL character, a zero byte.
; The result is still a string, but an empty one.
; NOTE(review): presumably only \x00 is consumed as the escape and the
; trailing 00 are literal chars hidden behind the NUL -- confirm.
(assert '(= (string-length "\x0000") 0))

;(test! "short hex escape")
; TODO Can't be tested, aborts interpreter, parsing fails
; maybe wrapping it in a string-port
; require at least two hex digits
;(assert-error `(string-length "\x")
;             "Error: Error reading string")
;(assert-error `(string-length "\x0")
;             "Error: Error reading string")

(test! "2 digit hex escape, ASCII")
; yields A
(assert `(= (string-length "\x41")   1))
; check the content too: the length alone would pass for any single char
(assert `(string=? "\x41"  "A"))

(test! "2 digit hex escape, non-ASCII > 127")
; NOTE: this label is live, but both assertions below are commented out,
; so this test currently asserts nothing.
;
; FIXME: the assertions fail with string length 0, i.e. returns EOF object
; See scheme.c line 1957 *p++=c is pushing one byte
;
; Expected: yields LATIN SMALL LETTER Y WITH DIAERESIS,
; one character encoded as two UTF-8 bytes.
;(assert `(= (string-length "\xff")   1))

; Uppercase \XFF also accepted
; yields LATIN SMALL LETTER Y WITH DIAERESIS
;(assert `(= (string-length "\XFF")   1))


(test! "3 digit hex escape x414 yields two characters")
; This is the current behavior.
; SF parses only two hex digits as part of the hex escape,
; and the third hex digit is parsed as itself.
; FUTURE parse a max of four bytes of hex, like say Racket
; yields A4
(assert `(= (string-length "\x414")   2))
; check the content too: the escape \x41 is A, followed by the literal digit 4
(assert `(string=? "\x414"  "A4"))

; FUTURE: Now does not accept
;(test! "3 digit hex escape")
; yields one unnamed char, 3 bytes
;(assert `(= (string-length "\xfff")   1))

;(test! "4 digit hex escape")
; yields unnamed char, 3 bytes
;(assert `(= (string-length "\xffff")   1))

;(test! "5 digit hex escape")
; yields 2 chars, the unnamed char, 3 bytes,
; and the char LATIN SMALL LETTER F, 1 byte
;(assert `(= (string-length "\xfffff")   2))


; Every four digit hex value, except the surrogate range 0xd800-0xdfff,
; is a valid codepoint, meaning it will encode in UTF-8.
; Whether it displays a visible glyph depends on other factors.


(test! "consecutive escape sequences")


(test! "consecutive hex escapes")
; two A chars
(assert `(= (string-length "\x41\x41")   2))
; check the content too: each escape must yield A independently
(assert `(string=? "\x41\x41"  "AA"))

; FIXME fails
;(test! "consecutive hex escapes")
; two CENT chars
;(assert `(= (string-length "\xa2\xa2")   2))

; FIXME fails
; (test! "consecutive octal escapes")
; two CENT chars
; (assert `(= (string-length "\242\242")   2))

(test! "consecutive escaped backslash and hex escape")
; yields 3 characters: BACKSLASH, A, BACKSLASH,
(assert `(= (string-length "\\\x41\\")   3))
; check the content too: the hex escape between the backslashes must yield A
(assert `(string=? "\\\x41\\"  "\\A\\"))

; FIXME fails
;(test! "consecutive escaped backslash and hex escape")
; yields 3 characters: BACKSLASH, CENT, BACKSLASH,
;(assert `(= (string-length "\\\xa2\\")   3))