;;;; utf8.scm -- Unicode support for Chicken ;; ;; Copyright (c) 2004 Alex Shinn ;; All rights reserved. ;; ;; BSD-style license: http://www.debian.org/misc/bsd.license ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; USAGE ;; ;; To make your code Unicode aware, just add the following: ;; ;; (require 'utf8) ;; ;; then all core, extra, regex and SRFI-13 string operations will be ;; Unicode aware. string-length will return the number of codepoints, ;; not the number of bytes, string-ref will index by codepoints and ;; return a char with an integer value up to 2^21, regular expressions ;; will match single codepoints rather than bytes and understand Unicode ;; character classes, etc. ;; ;; Strings are still native strings and may be passed to external ;; libraries (either Scheme or foreign) perfectly safely. Libraries ;; that do parsing invariably do so on ASCII character boundaries and ;; are thus guaranteed to be compatible. Libraries that reference ;; strings by index would need to be modified with a UTF-8 version. ;; Currently all existing eggs are UTF-8 safe to my knowledge. ;; ;; LIMITATIONS ;; ;; string-set! returns the string modified, and in the unusual case of ;; writing over a char with a new char it will allocate a new string and ;; return that. Thus it is not guaranteed to modify in place, and the ;; new idiom becomes ;; ;; (set! str (string-set! str i ch)) ;; ;; much like remove! and delete!. This is not compatible with non-utf8 ;; code since the core string-set! doesn't return the string, so if you ;; want to use string-set! you can't just [un]comment the (require ;; 'utf8) to toggle between Unicode mode. However you probably don't ;; need to ever use string-set!. ;; ;; string-fill! and string-copy! are likewise modified to return a ;; string and are not guaranteed to work in place. ;; ;; Regular expressions accept Unicode character classes and ASCII ;; character ranges, but currently does not accept Unicode character ;; ranges. ;; ;; PERFORMANCE ;; ;; string-length, string-ref and string-set! are all O(n) operations as ;; opposed to the usual O(1) since UTF-8 is a variable width encoding. ;; Use of these should be discouraged - it is much cleaner to use the ;; high-level SRFI-13 procedures and string ports. For examples of how ;; to do common idioms without these procedures look at any string-based ;; code in Gauche. ;; ;; string?, string=?, string-append, all R5RS string comparisons, and ;; read-line are unmodified. ;; ;; Regular expression matching will be just as fast except in the case ;; of Unicode character classes. ;; ;; All other procedures incur zero to minor overhead, but keep the same ;; asymptotic performance. ;; ;; BYTE-STRINGS ;; ;; Sometimes you may need access to the original string primitives so ;; you can directly access bytes, such as if you were implementing your ;; own regex library or text buffer and wanted optimal performance. For ;; these cases we have renamed the original primitives by replacing ;; "string" with "byte-string". Thus byte-string-length is the length ;; in bytes, not characters, of the strings (the equivalent of Gauche's ;; string-size). byte-string-set! can corrupt the UTF-8 encoding and ;; should be used sparingly if at all. ;; ;; DISCUSSION ;; ;; There are two ways to add Unicode string support to an existing ;; language: redefine the strings themselves (i.e. add a new string ;; type), or redefine the operations on the strings. The former causes ;; a schism in your string libraries, dividing them between ;; Unicode-aware and not, either doubling your library implementations ;; or limiting them to one type or the other. You can't freely pass ;; strings to other libraries without keeping track of their types and ;; converting when needed. It becomes slow and unwieldy. Perl is the ;; only language I know of who seriously tried this, and the modules ;; which worked with Unicode strings were minimal, frequent type ;; conversions were needed, a general mess ensued, and Perl very quickly ;; switched to the latter approach. ;; ;; UTF-8 is ideal for this sort of extension because it is "backwards ;; compatible" with ASCII. Any ASCII (7-bit) byte found within a UTF-8 ;; string is guaranteed to be that character, not part of a multibyte ;; character, so parsing libraries that work on ASCII characters work ;; unmodified. This includes most existing text formats and network ;; protocols. The EUC (Extended Unix Code) encodings also have this ;; feature so a similar module could be implemented allowing users to ;; (require 'euc-jp) for example and work in Japanese EUC rather than ;; Unicode. Other encodings such as Shift_JIS satisfy the requirement ;; that an ASCII string has the same meaning in the encoding, but ;; multibyte characters in the encoding may include ASCII bytes, ;; breaking the rule we need for safe ASCII parsing. A few encodings ;; like UTF-16 and UTF-32 are completely incompatible. UTF-16 is ;; primarily only used these days by Java, a victim of the unfortunate ;; fact that at first UTF-16 was fixed with but is no longer with the ;; advent of surrogate pairs. Note that even without this module you ;; can write source code in Chicken in any ASCII compatible encoding ;; like ISO-8859-* or UTF-8 and define symbols with that encoding ;; (letting you replace lambda with syntax for a real greek lambda, for ;; example). ;; ;; Other languages that use UTF-8 include Perl, Python, TCL. XML and ;; increasingly more and more network standards are using UTF-8 by ;; default, and major databases all support UTF-8. Libraries with UTF-8 ;; support include Gtk, SDL, and freetype. ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (cond-expand ((or chicken-compile-shared shared) ) (else (declare (unit utf8)))) (declare (not standard-bindings string-ref string-set! string-length string substring make-string string->list list->string string-fill! write-char read-char display) (export ;; byte-level access byte-string-length byte-string-ref byte-string-set! make-byte-string byte-string byte-substring byte-string->list list->byte-string byte-string-fill! write-byte-char read-byte-char byte-display ;; R5RS string-length string-ref string-set! make-string string substring string->list list->string string-fill! write-char read-char display ;; library reverse-list->string print print* ;; extras read-string write-string read-token ->string conc string-chop string-split string-translate substring=? substring-ci=? substring-index substring-index-ci ;; regexp grep regexp string-substitute string-substitute* string-split-fields string-match string-match-positions string-match-offsets string-search string-search-positions string-search-offsets ;; srfi-13 string-every string-any string-copy string-copy! substring/shared string-take string-take-right string-drop string-drop-right string-pad string-pad-right string-trim string-trim-right string-trim-both string-compare string-compare-ci string-hash string-hash-ci string= string<> string< string> string<= string>= string-ci= string-ci<> string-ci< string-ci> string-ci<= string-ci>= string-prefix? string-prefix-ci? string-prefix-length string-prefix-length-ci string-suffix? string-suffix-ci? string-suffix-length string-suffix-length-ci string-index string-index-right string-skip string-skip-right string-count string-contains string-contains-ci string-fold string-fold-right string-unfold string-unfold-right string-map string-map! string-for-each string-for-each-index xsubstring string-xcopy! string-replace string-tokenize string-filter string-delete )) (require-extension extras srfi-13 regex) ;; internal byte string operations (define byte-string-length string-length) (define byte-string-ref string-ref) (define byte-string-set! string-set!) (define make-byte-string make-string) (define byte-string string) (define byte-substring substring) (define byte-string->list string->list) (define list->byte-string list->string) ;; utilities which take and return integers (define (string-int-ref s i) (char->integer (byte-string-ref s i))) (define (string-int-set! s i c) (byte-string-set! s i (integer->char c))) ;; determine if a string only has 7-bit ASCII characters (define (ascii-string? str) (let ((size (byte-string-size str))) (let loop ((i 0)) (if (= i size) #t (and (> 128 (string-int-ref str i)) (loop (+ i 1))))))) ;; from SRFI-33, useful in splitting up the bit patterns used to ;; represent unicode values in utf8 (define (extract-bit-field size position n) (bitwise-and (bitwise-not (arithmetic-shift -1 size)) (arithmetic-shift n (- position)))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; indexing utils ;; number of total bytes in a utf8 char given the 1st byte (define utf8-start-byte->length (let ((table '#( 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 0x 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 1x 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 2x 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 3x 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 4x 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 5x 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 6x 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 7x 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 8x 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 9x 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; ax 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; bx 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 ; cx 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 ; dx 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 ; ex 4 4 4 4 4 4 4 4 5 5 5 5 6 6 0 0 ; fx ))) (lambda (i) (vector-ref table i)))) (define (ucs-integer->length x) (cond ((<= x #x7F) 1) ((<= x #x7FF) 2) ((<= x #xFFFF) 3) ((<= x #x1FFFFF) 4) ((<= x #x3FFFFFF) 5) ((<= x #x7FFFFFFF) 6) (else (error "unicode codepoint out of range:" x)))) (define (utf8-index->offset s size pos) (let loop ((i 0) (count 0)) (cond ((>= i size) (error "index out of range" s pos)) ((= count pos) i) (else (loop (+ i (utf8-start-byte->length (string-int-ref s i))) (+ count 1)))))) (define (utf8-offset->index s size off) (let loop ((i 0) (count 0)) (cond ((>= i size) (error "index out of range" s off)) ((= i off) pos) (else (loop (+ i (utf8-start-byte->length (string-int-ref s i))) (+ count 1)))))) (define (utf8-prev-char s off) (let loop ((i (- off 1))) (if (= #b10000000 (bitwise-and #b11000000 (string-int-ref s i))) (loop (- i 1)) i))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; redefine primitives (define (string-length s) (let ((size (byte-string-length s))) (let loop ((i 0) (res 0)) (if (>= i size) res (loop (+ i (utf8-start-byte->length (string-int-ref s i))) (+ res 1)))))) ;; like STRING but only takes one argument (define (char->string c) (let* ((c-i (char->integer c)) (len (ucs-integer->length c-i))) (if (<= len 1) (byte-string c) (let ((res (make-byte-string len))) (string-set-at-byte-in-place! res len len 0 c-i))))) (define (string . args) (list->string args)) (define (with-substring-offsets proc s start . opt) (let* ((size (byte-string-length s)) (b1 (utf8-index->offset s size start))) (if (pair? opt) (let ((end (car opt))) (let loop ((b2 b1) (count start)) (cond ((> b2 size) (error "index out of range" s end)) ((= count end) (proc s b1 b2)) (else (loop (+ b2 (utf8-start-byte->length (string-int-ref s b2))) (+ count 1)))))) (proc s b1 size)))) (define (substring s start . opt) (apply with-substring-offsets byte-substring s start opt)) (define (make-string len . opt) (if (pair? opt) (let* ((c (car opt)) (c-i (char->integer c)) (c-len (ucs-integer->length c-i))) (if (<= c-len 1) (make-byte-string len c) (let* ((size (* len c-len)) (res (make-byte-string size))) (let loop ((i 0)) (if (>= i size) res (begin (string-set-at-byte-in-place! res size c-len i c-i) (loop (+ i c-len)))))))) (make-byte-string len))) (define (string->list str) (let ((size (byte-string-length str))) (let loop ((i 0) (res '())) (if (>= i size) (reverse res) (loop (+ i (utf8-start-byte->length (string-int-ref str i))) (cons (string-ref-at-byte str size i) res)))))) (define (list->string ls) (apply string-append (map char->string ls))) (define (string-fill! str c) (let* ((size (byte-string-length str)) (len (string-length str)) (c-i (char->integer c)) (c-len (ucs-integer->length c-i)) (needed (* c-len len))) (if (= needed size) (let ((c-str (char->string c))) (do ((i 0 (+ i c-len))) ((= i size) str) (string-set-at-byte-in-place! str size len i c-i))) (make-string len c)))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; accessors (define (string-ref-at-byte s size byte) (let* ((c (string-int-ref s byte)) (len (utf8-start-byte->length c))) (if (<= len 1) (integer->char c) (let ((end (+ byte len))) (if (> end size) (error "utf8 trailing char overflow" s byte) (let loop ((i (+ byte 1)) (res (extract-bit-field (- 7 len) 0 c))) (if (= i end) (integer->char res) (loop (+ i 1) (bitwise-ior (arithmetic-shift res 6) (bitwise-and #b00111111 (string-int-ref s i))))))))))) (define (string-ref s pos) (let ((size (byte-string-length s))) (string-ref-at-byte s size (utf8-index->offset s size pos)))) (define (string-set-at-byte-in-place! s size len byte val-i) (let ((end (+ byte len))) (cond ((> end size) (error "utf8 trailing char overflow" s byte)) ((<= len 1) (string-int-set! s byte val-i)) (else (let* ((tag (- (expt 2 len) 1)) (tag-shift (arithmetic-shift tag (- 8 len))) (body (extract-bit-field (- 7 len) (* 6 (- len 1)) val-i)) (b1 (bitwise-ior tag-shift body))) (string-int-set! s byte b1)) (let loop ((i 1)) (unless (= i len) (let ((b (bitwise-ior #b10000000 (extract-bit-field 6 (* 6 (- len i 1)) val-i)))) (string-int-set! s (+ byte i) b) (loop (+ i 1))))))) s)) (define (string-set-at-byte! s size byte val) (let* ((c (string-int-ref s byte)) (len (utf8-start-byte->length c)) (val-i (char->integer val)) (val-len (ucs-integer->length val-i))) (if (not (= len val-len)) ;; different size, allocate & return new string (let ((s1 (byte-substring s 0 byte)) (s2 (string val)) (s3 (byte-substring s (+ byte len) size))) (string-append s1 s2 s3)) ;; modify in place (string-set-at-byte-in-place! s size len byte val-i)))) (define (string-set! s pos val) (let ((size (byte-string-length s))) (string-set-at-byte! s size (utf8-index->offset s size pos) val))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Basic I/O (define write-byte-char write-char) (define read-byte-char read-char) (define byte-display display) (define (write-char c . opt) (apply display (char->string c) opt)) (define (read-char . opt) (let* ((p (get-optional opt (current-input-port))) (b1 (read-byte p))) (if (eof-object? b1) b1 (let ((len (utf8-byte-count b1))) (if (<= len 1) b1 (let loop ((res (extract-bit-field (- 6 len) 0 b1)) (i (- len 1))) (if (zero? i) res (let ((b2 (read-byte p))) (cond ((eof-object? b2) b2) ((not (= #b10 (extract-bit-field 2 6 b2))) (error "invalid utf8 sequence")) (else (loop (bitwise-ior (arithmetic-shift res 6) (bitwise-and #b00111111 b2)) (- i 1)))))))))))) (define (display x . opt) (apply byte-display (if (char? x) (char->string x) x) opt)) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; library (define reverse-list->byte-string reverse-list->string) (define byte-print print) (define byte-print* print*) (define (reverse-list->string ls) (list->string (reverse ls))) (define (print . args) (apply print (map (lambda (x) (if (char? x) (char->string x) x) args)))) (define (print* . args) (apply print* (map (lambda (x) (if (char? x) (char->string x) x) args)))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; I/O extras (define read-byte-string read-string) (define write-byte-string write-string) (define read-byte-token read-token) ;; this could be optimized by reading bytes while counting characters, ;; instead of counting characters (define (read-string . opt) (let-optionals* opt ((num #f) (in (current-input-port))) (if num (let loop ((i 0) (acc '())) (if (>= i num) (list->string (reverse acc)) (let ((ch (read-char in))) (if (eof-object? ch) (loop num acc) (loop (+ i 1) (cons ch acc)))))) (read-byte-string num in)))) (define (write-string str . opt) (let-optionals* opt ((num #f) (out (current-output-port))) (if (and num (< num (string-length str))) (byte-display (substring str 0 num) out) (byte-display str out)))) (define (read-token pred . opt) (let ((in (if (pair? opt) (car opt) (current-input-port)))) (let loop ((acc '())) (let ((ch (read-char in))) (if (or (eof-object? ch) (not (pred ch))) (list->string (reverse acc)) (loop (cons ch acc))))))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; string extras (define ->byte-string string) (define byte-conc conc) (define byte-string-chop string-chop) (define byte-string-split string-split) (define byte-string-translate string-translate) (define byte-substring=? substring=?) (define byte-substring-ci=? substring-ci=?) (define byte-substring-index substring-index) (define byte-substring-index-ci substring-index-ci) (define (->string x) (if (char? x) (char->string x) (->byte-string x))) (define (conc . args) (apply string-append (map ->string args))) (define (string-chop str len) (let ((size (byte-string-length str))) (let loop ((i 0) (from 0) (off 0) (acc '())) (cond ((>= off size) (if (< off size) (reverse (cons (byte-substring str off size) acc)) (reverse acc))) ((= i len) (loop 0 off off (cons (byte-substring str from off) acc))) (else (loop (+ i 1) from (+ off (utf8-start-byte->length (string-int-ref str off))) acc)))))) (define (string-split str . opt) (let-optionals* opt ((delim #f) (keep-empty? #f)) (if (or (not delim) (ascii-string? delim)) (byte-string-split str delim keep-empty?) (let ((delims (string->list delim)) (join (if keep-empty? (lambda (cur acc) (cons (list->string (reverse cur)) acc)) (lambda (cur acc) (if (null? cur) acc (cons (list->string (reverse cur)) acc)))))) (let loop ((ls (string->list str)) (cur '()) (acc '())) (cond ((null? ls) (reverse (join cur acc))) ((memv (car ls) delims) (loop (cdr ls) '() (join cur acc))) (else (loop (cdr ls) (cons (car ls) cur) acc)))))))) (define (string->vector str) (list->vector (string->list str))) (define (vector-memv x vec) (let ((len (vector-length vec))) (let loop ((i 0)) (cond ((= i len) #f) ((eqv? x (vector-ref vec i)) i) (else (loop (+ i 1))))))) (define (string-translate str from . opt) (let* ((from-vec (string->vector from))) (if (pair? opt) (let ((to-vec (string->vector (car opt)))) (string-map (lambda (c) (cond ((vector-memv c from-vec) => (lambda (i) (vector-ref to-vec i))) (else c))) str)) (string-delete str (lambda (c) (vector-memv c from-vec)))))) (define (substring=? s1 s2 . opt) (let ((s1-len (string-length s1)) (s2-len (string-length s2))) (let-optionals* opt ((start1 0) (start2 0) (len (min (- s1-len start1) (- s2-len start2)))) (with-substring-offsets (lambda (s1 s1-start s1-end) (with-substring-offsets (lambda (s2 s2-start s2-end) (byte-substring=? s1 s2 s1-start s2-start (- s1-end s1-start))) s2 start2 (+ start2 len))) s1 start1 (+ start1 len))))) (define (substring=? s1 s2 . opt) (let ((s1-len (string-length s1)) (s2-len (string-length s2))) (let-optionals* opt ((start1 0) (start2 0) (len (min (- s1-len start1) (- s2-len start2)))) (with-substring-offsets (lambda (s1 s1-start s1-end) (with-substring-offsets (lambda (s2 s2-start s2-end) (byte-substring-ci=? s1 s2 s1-start s2-start (- s1-end s1-start))) s2 start2 (+ start2 len))) s1 start1 (+ start1 len))))) (define (substring-index which where . opt) (apply string-contains where which opt)) (define (substring-index-ci which where . opt) (apply string-contains-ci where which opt)) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; regexps (define byte-grep grep) (define byte-regexp regexp) (define byte-string-match string-match) (define byte-string-match-positions string-match-positions) (define byte-string-search string-search) (define byte-string-search-positions string-search-positions) (define byte-string-split-fields string-split-fields) (define byte-string-substitute string-substitute) (define byte-string-substitute* string-substitute*) (define pat-utf8 (let ((pat-ascii "[\\000-\\177]") (pat-utf8-2 "[\\302-\\337][\\200-\\301]") (pat-utf8-3 "[\\340-\\357][\\200-\\301]{2}")) (reverse (byte-string->list (string-append "(?:" pat-ascii "|" pat-utf8-2 "|" pat-utf8-3 ")"))))) (define (utf8-pattern->byte-pattern str) (let ((size (byte-string-length str))) (define (scan i res) (if (= i size) (list->byte-string (reverse res)) (let ((c (byte-string-ref str i))) (case c ((#\.) ;; XXXX optimize the .+ cases (if (and (< (+ i 1) size) (eqv? #\* (byte-string-ref str (+ i 1)))) (scan (+ i 2) (cons #\* (cons #\. res))) (scan (+ i 1) (append pat-utf8 res)))) ((#\\) (scan (+ i 2) (cons (byte-string-ref str (+ i 1)) (cons #\\ res)))) ((#\[) (class (+ i 1) '() #t res)) (else (scan (+ i 1) (cons c res))))))) (define (class->group ls) (let loop ((ls ls) (acc '())) (if (null? ls) (byte-string->list (string-append "(?:" (string-intersperse acc "|") ")")) (let ((c (car ls)) (rst (cdr ls))) (if (and (pair? rst) (eqv? #\- (car rst))) (let ((end (cadr rst))) (if (or (> (char->integer c) 128) (> (char->integer end) 128)) (error "full unicode classes not supported") (loop (cddr rst) (cons (string-append "[" (char->string end) "-" (char->string c) "]") acc)))) (loop (cdr ls) (cons (char->string c) acc))))))) (define (class i acc ascii? res) (if (= i size) (error "incomplete character class") (let ((c (byte-string-ref str i))) (case c ((#\]) (if ascii? (scan (+ i 1) (cons #\] (append acc (cons #\[ res)))) (scan (+ i 1) (append (reverse (class->group acc)) res)))) ((#\\) (let ((next (byte-string-ref str (+ i 1)))) (if (< (char->integer next) 128) (class (+ i 2) (cons next (cons #\\ acc)) ascii? res) (class (+ i 1) (cons #\\ acc) ascii? res)))) (else (if (< (char->integer c) 128) (class (+ i 1) (cons c acc) ascii? res) (class (+ i (utf8-start-byte->length (char->integer c))) (cons (string-ref-at-byte str size i) acc) #f res))))))) (scan 0 '()))) (define (regexp str) (byte-regexp (utf8-pattern->byte-pattern str))) (define (->rx x) (if (regexp? x) x (regexp x))) (define (opt-off s o) (if (pair? o) (utf8-index->offset s (byte-string-length s) (car o)) 0)) (define (grep rx ls) (byte-grep (->rx rx) ls)) (define (string-match rx str . o) (byte-string-match (->rx rx) str (opt-off str o))) (define (string-match-offsets rx str . o) (byte-string-match-positions (->rx rx) str (opt-off str o))) (define (string-search rx str . o) (let* ((start (opt-off str o)) (range (if (and (pair? o) (pair? (cdr o))) (opt-off str (cdr o)) (- (byte-string-length str) start)))) (byte-string-search (->rx str) str start range))) (define (string-search-offsets) (let* ((start (opt-off str o)) (range (if (and (pair? o) (pair? (cdr o))) (opt-off str (cdr o)) (- (byte-string-length str) start)))) (byte-string-search-positions (->rx str) str start range))) (define (string-split-fields rx str . o) (let-optionals* o ((mode #t) o2) (let ((start (opt-off str o2))) (byte-string-split-fields (->rx rx) str mode start)))) (define (string-substitute rx subst str . o) (apply byte-string-substitute (->rx rx) subst str o)) (define (string-substitute* str smap) (byte-string-substitute* str (map (lambda (x) (cons (->rx (car x)) (cdr x))) smap))) ;; these could be a lot faster, but you don't want to be working with ;; positions anyway (define (string-match-positions rx str . opt) (let* ((size (string-size str)) (->pos (lambda (o) (utf8-offset->index str size o)))) (map (lambda (x) (if (pair? x) (map ->pos x) x)) (apply string-match-offsets rx str opt)))) (define (string-search-positions rx str . opt) (let* ((size (string-size str)) (->pos (lambda (o) (utf8-offset->index str size o)))) (map (lambda (x) (if (pair? x) (map ->pos x) x)) (apply string-search-offsets rx str opt)))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; SRFI-13 (define byte-string-fold string-fold) (define byte-string-map string-map) (define byte-string-map! string-map!) (define byte-string-any string-any) (define byte-string-every string-every) (define byte-string-tabulate string-tabulate) (define byte-string-copy string-copy) (define byte-string-copy! string-copy!) (define byte-substring/shared substring/shared) (define byte-string-pad string-pad) (define byte-string-pad-right string-pad-right) (define byte-string-trim string-trim) (define byte-string-trim-right string-trim-right) (define byte-string-trim-both string-trim-both) (define byte-string-index string-index) (define byte-string-index-right string-index-right) (define byte-string-skip string-skip) (define byte-string-skip-right string-skip-right) (define byte-string-contains string-contains) (define byte-string-contains-ci string-contains-ci) (define byte-string= string=) (define byte-string<> string<>) (define byte-string< string<) (define byte-string> string>) (define byte-string<= string<=) (define byte-string>= string>=) (define byte-string-ci= string-ci=) (define byte-string-ci<> string-ci<>) (define byte-string-ci< string-ci<) (define byte-string-ci> string-ci>) (define byte-string-ci<= string-ci<=) (define byte-string-ci>= string-ci>=) (define (opt-substring s opt) (if (null? opt) (byte-string-copy s) (let ((start (car opt)) (opt2 (cdr opt))) (if (null? opt2) (byte-substring s (utf8-index->offset s (byte-string-length s) start)) (substring s start (car opt2)))))) (define (opt-substring/shared s opt) (if (null? opt) s (let ((start (car opt)) (opt2 (cdr opt))) (if (null? opt2) (byte-substring/shared s (utf8-index->offset s (byte-string-length s) start)) (substring/shared s start (car opt2)))))) (define (string-fold kons knil s . opt) (let-optionals* opt ((start 0) (end (string-length s))) (let ((size (byte-string-length s))) (let loop ((i start) (b (utf8-index->offset s size start)) (acc knil)) (if (>= i end) acc (loop (+ i 1) (+ b (utf8-start-byte->length (string-int-ref s b))) (kons (string-ref-at-byte s size b) acc))))))) (define (string-map proc s . opt) (list->string (reverse (apply string-fold (lambda (c acc) (cons (proc c) acc)) '() s opt)))) (define string-map! string-map) (define (string-for-each proc s . opt) (apply string-fold (lambda (c acc) (proc c)) #f s opt)) (define (string-for-each-index proc s . opt) (let ((start (if (pair? opt) (car opt) 0)) (end (if (and (pair? opt) (pair? (cdr opt))) (cadr opt) (string-length s)))) (do ((i start (+ i 1))) ((= i end)) (proc i)))) (define (char-predicate x) (cond ((procedure? x) x) ((char? x) (lambda (c) (eqv? c x))) ((char-set? x) (lambda (c) (char-set-contains? x c))) (else (error "unknown predicate" x)))) (define (string-any x str . opt) (let ((pred (char-predicate x))) (call-with-current-continuation (lambda (return) (apply string-fold (lambda (c acc) (cond ((pred c) => return) (else #f))) #f str opt))))) (define (string-every x str . opt) (let ((pred (char-predicate x))) (call-with-current-continuation (lambda (return) (apply string-fold (lambda (c acc) (cond ((pred c) => identity) (else (return #f)))) #f str opt))))) (define (string-tabulate proc len) (let loop ((i 0) (acc '())) (if (= i len) (reverse-list->string acc) (loop (+ i 1) (cons (proc i) acc))))) (define (string-copy s . opt) (if (null? opt) (byte-string-copy s) (apply substring s opt))) (define (substring/shared s start . opt) (apply with-substring-offsets byte-substring/shared s start opt)) (define (string-copy! target tstart str . opt) (let* ((s (if (pair? opt) (apply substring str opt) s)) (s-size (byte-string-length s)) (t-total-size (byte-string-length target)) (t-off (utf8-index->offset target t-total-size tstart)) (t-size (- t-total-size t-off))) (if (= s-size t-size) (begin (byte-string-copy! target t-off s) target) s))) (define (string-take s n) (substring s 0 n)) (define (string-drop s n) (substring s n)) (define (string-take-right s n) (substring s (- (string-length s) n))) (define (string-drop-right s n) (substring s 0 (- (string-length s) n))) (define (make-string-padder proc) (lambda (s len . opt) (let-optionals* opt ((ch #\space) (start 0) (end (string-length s))) (let ((diff (- end start))) (if (<= len diff) (substring/shared s start (+ start len)) (proc (substring/shared s start end) (make-string (- len diff) ch))))))) (define string-pad (make-string-padder (lambda (a b) (string-append b a)))) (define string-pad-right (make-string-padder string-append)) (define (string-trim s . opt) (let-optionals* opt ((x #f) rest) (let ((s2 (opt-substring/shared s rest))) (if x (byte-substring/shared s2 (string-offset s2 x)) (byte-string-trim s2))))) (define (string-trim-right s . opt) (let-optionals* opt ((x #f) rest) (let ((s2 (opt-substring/shared s rest))) (if x (byte-substring/shared s2 0 (string-offset-right s2 x)) (byte-string-trim-right s2))))) (define (string-trim-both s . opt) (let-optionals* opt ((x #f) rest) (let ((s2 (opt-substring/shared s rest))) (if x (string-trim x (string-trim-right x s2)) (byte-string-trim-both s2))))) (define (with-string-index+offset proc s x . opt) (let-optionals* opt ((start 0) (end -1)) (let ((size (string-size s)) (pred (char-predicate x))) (let loop ((i start) (off (utf8-index->offset s size start))) (if (or (= i end) (= off size)) (proc #f #f) (let ((ch (string-ref-at-byte s size off))) (if (pred ch) (proc i off) (loop (+ i 1) (+ off (ucs-integer->length (char->integer ch))))))))))) (define (with-string-index+offset-right s x . opt) (let-optionals* opt ((start 0) (end (string-length s))) (let ((size (string-size s)) (pred (char-predicate x))) (let loop ((i (- end 1)) (off (utf8-index->offset s size (- end 1)))) (if (< i start) (proc #f #f) (let ((ch (string-ref-at-byte s size off))) (if (pred ch) (proc i off) (loop (- i 1) (utf8-prev-char s off))))))))) (define (arg1 a b) a) (define (arg2 a b) b) (define (string-index s x . opt) (apply with-string-index+offset arg1 s x opt)) (define (string-offset s x . opt) (apply with-string-index+offset arg2 s x opt)) (define (string-index-right s x . opt) (apply with-string-index+offset-right arg1 s x opt)) (define (string-offset s x . opt) (apply with-string-index+offset-right arg2 s x opt)) (define (string-skip s x . opt) (apply string-index (complement (char-predicate x)))) (define (string-skip-right s x . opt) (apply string-index-right (complement (char-predicate x)))) (define (string-count s x . opt) (let ((pred (char-predicate x))) (apply string-fold (lambda (c sum) (if (pred c) (+ sum 1) sum)) 0 s opt))) (define (with-two-substrings proc s1 s2 opt) (proc (opt-substring/shared s1 opt) (opt-substring/shared s2 (or (and (pair? (opt)) (pair? (cdr opt)) (cddr opt)) '())))) (define (string-compare s1 s2 proc< proc= proc> . opt) (with-two-substrings (lambda (s1 s2) (byte-string-compare s1 s2 proc< proc= proc>)) s1 s2 opt)) (define (string-compare-ci s1 s2 proc< proc= proc> . opt) (with-two-substrings (lambda (s1 s2) (byte-string-compare-ci s1 s2 proc< proc= proc>)) s1 s2 opt)) (define (make-string-comparator proc) (lambda (s1 s2 . opt) (with-two-substrings proc s1 s2 opt))) (define string= (make-string-comparator byte-string=)) (define string<> (make-string-comparator byte-string<>)) (define string< (make-string-comparator byte-string<)) (define string> (make-string-comparator byte-string>)) (define string<= (make-string-comparator byte-string<=)) (define string>= (make-string-comparator byte-string>=)) (define string-ci= (make-string-comparator byte-string-ci=)) (define string-ci<> (make-string-comparator byte-string-ci<>)) (define string-ci< (make-string-comparator byte-string-ci<)) (define string-ci> (make-string-comparator byte-string-ci>)) (define string-ci<= (make-string-comparator byte-string-ci<=)) (define string-ci>= (make-string-comparator byte-string-ci>=))