scsh-0.6/scheme/sort/lmsort.scm

387 lines
15 KiB
Scheme

;;; list merge & list merge-sort -*- Scheme -*-
;;; Copyright (c) 1998 by Olin Shivers.
;;; This code is open-source; see the end of the file for porting and
;;; more copyright information.
;;; Olin Shivers
;;; Exports:
;;; (list-merge < lis lis) -> list
;;; (list-merge! < lis lis) -> list
;;; (list-merge-sort < lis) -> list
;;; (list-merge-sort! < lis) -> list
;;; A stable list merge sort of my own device
;;; Two variants: pure & destructive
;;;
;;; This list merge sort is opportunistic (a "natural" sort) -- it exploits
;;; existing order in the input set. Instead of recursing all the way down to
;;; individual elements, the leaves of the merge tree are maximal contiguous
;;; runs of elements from the input list. So the algorithm does very well on
;;; data that is mostly ordered, with a best-case time of O(n) when the input
;;; list is already completely sorted. In any event, worst-case time is
;;; O(n lg n).
;;;
;;; The destructive variant is "in place," meaning that it allocates no new
;;; cons cells at all; it just rearranges the pairs of the input list with
;;; SET-CDR! to order it.
;;;
;;; The interesting control structure is the combination recursion/iteration
;;; of the core GROW function that does an "opportunistic" DFS walk of the
;;; merge tree, adaptively subdividing in response to the length of the
;;; merges, without requiring any auxiliary data structures beyond the
;;; recursion stack. It's actually quite simple -- ten lines of code.
;;; -Olin Shivers 10/20/98
;;; (mlet ((var-list mv-exp) ...) body ...)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; A LET* form that handles multiple values. Move this into the two clients
;;; if you don't have a module system handy to restrict its visibility...
(define-syntax mlet ; Multiple-value LET*
(syntax-rules ()
((mlet ((() exp) rest ...) body ...)
(begin exp (mlet (rest ...) body ...)))
((mlet (((var) exp) rest ...) body ...)
(let ((var exp)) (mlet (rest ...) body ...)))
((mlet ((vars exp) rest ...) body ...)
(call-with-values (lambda () exp)
(lambda vars (mlet (rest ...) body ...))))
((mlet () body ...) (begin body ...))))
;;; (list-merge-sort < lis)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; A natural, stable list merge sort.
;;; - natural: picks off maximal contiguous runs of pre-ordered data.
;;; - stable: won't invert the order of equal elements in the input list.
(define (list-merge-sort elt< lis)
;; (getrun lis) -> run runlen rest
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Pick a run of non-decreasing data off of non-empty list LIS.
;; Return the length of this run, and the following list.
(define (getrun lis)
(let lp ((ans '()) (i 1) (prev (car lis)) (xs (cdr lis)))
(if (pair? xs)
(let ((x (car xs)))
(if (elt< x prev)
(values (append-reverse ans (cons prev '())) i xs)
(lp (cons prev ans) (+ i 1) x (cdr xs))))
(values (append-reverse ans (cons prev '())) i xs))))
(define (append-reverse rev-head tail)
(let lp ((rev-head rev-head) (tail tail))
(if (null-list? rev-head) tail
(lp (cdr rev-head) (cons (car rev-head) tail)))))
(define (null-list? l)
(cond ((pair? l) #f)
((null? l) #t)
(else (error "null-list?: argument out of domain" l))))
;; (merge a b) -> list
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; List merge -- stably merge lists A (length > 0) & B (length > 0).
;; This version requires up to |a|+|b| stack frames.
(define (merge a b)
(let recur ((x (car a)) (a a)
(y (car b)) (b b))
(if (elt< y x)
(cons y (let ((b (cdr b)))
(if (pair? b)
(recur x a (car b) b)
a)))
(cons x (let ((a (cdr a)))
(if (pair? a)
(recur (car a) a y b)
b))))))
;; (grow s ls ls2 u lw) -> [a la unused]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; The core routine. Read the next 20 lines of comments & all is obvious.
;; - S is a sorted list of length LS > 1.
;; - LS2 is some power of two <= LS.
;; - U is an unsorted list.
;; - LW is a positive integer.
;; Starting with S, and taking data from U as needed, produce
;; a sorted list of *at least* length LW, if there's enough data
;; (LW <= LS + length(U)), or use all of U if not.
;;
;; GROW takes maximal contiguous runs of data from U at a time;
;; it is allowed to return a list *longer* than LW if it gets lucky
;; with a long run.
;;
;; The key idea: If you want a merge operation to "pay for itself," the two
;; lists being merged should be about the same length. Remember that.
;;
;; Returns:
;; - A: The result list
;; - LA: The length of the result list
;; - UNUSED: The unused tail of U.
(define (grow s ls ls2 u lw) ; The core of the sort algorithm.
(if (or (<= lw ls) (not (pair? u))) ; Met quota or out of data?
(values s ls u) ; If so, we're done.
(mlet (((ls2) (let lp ((ls2 ls2))
(let ((ls2*2 (+ ls2 ls2)))
(if (<= ls2*2 ls) (lp ls2*2) ls2))))
;; LS2 is now the largest power of two <= LS.
;; (Just think of it as being roughly LS.)
((r lr u2) (getrun u)) ; Get a run, then
((t lt u3) (grow r lr 1 u2 ls2))) ; grow it up to be T.
(grow (merge s t) (+ ls lt) ; Merge S & T,
(+ ls2 ls2) u3 lw)))) ; and loop.
;; Note: (LENGTH LIS) or any constant guaranteed
;; to be greater can be used in place of INFINITY.
(if (pair? lis) ; Don't sort an empty list.
(mlet (((r lr tail) (getrun lis)) ; Pick off an initial run,
((infinity) #o100000000) ; then grow it up maximally.
((a la v) (grow r lr 1 tail infinity)))
a)
'()))
;;; (list-merge-sort! < lis)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; A natural, stable, destructive, in-place list merge sort.
;;; - natural: picks off maximal contiguous runs of pre-ordered data.
;;; - stable: won't invert the order of equal elements in the input list.
;;; - destructive, in-place: this routine allocates no extra working memory;
;;; it simply rearranges the list with SET-CDR! operations.
(define (list-merge-sort! elt< lis)
;; (getrun lis) -> runlen last rest
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Pick a run of non-decreasing data off of non-empty list LIS.
;; Return the length of this run, the last cons cell of the run,
;; and the following list.
(define (getrun lis)
(let lp ((lis lis) (x (car lis)) (i 1) (next (cdr lis)))
(if (pair? next)
(let ((y (car next)))
(if (elt< y x)
(values i lis next)
(lp next y (+ i 1) (cdr next))))
(values i lis next))))
;; (merge! a enda b endb) -> [m endm]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Destructively and stably merge non-empty lists A & B.
;; The last cons of A is ENDA. (The cdr of ENDA can be non-nil.)
;; the last cons of B is ENDB. (The cdr of ENDB can be non-nil.)
;;
;; Return the first and last cons cells of the merged list.
;; This routine is iterative & in-place: it runs in constant stack and
;; doesn't allocate any cons cells. It is also tedious but simple; don't
;; bother reading it unless necessary.
(define (merge! a enda b endb)
;; The logic of these two loops is completely driven by these invariants:
;; SCAN-A: (CDR PREV) = A. X = (CAR A). Y = (CAR B).
;; SCAN-B: (CDR PREV) = B. X = (CAR A). Y = (CAR B).
(letrec ((scan-a (lambda (prev x a y b) ; Zip down A until we
(cond ((elt< y x) ; find an elt > (CAR B).
(set-cdr! prev b)
(let ((next-b (cdr b)))
(if (eq? b endb)
(begin (set-cdr! b a) enda) ; Done.
(scan-b b x a (car next-b) next-b))))
((eq? a enda) (maybe-set-cdr! a b) endb) ; Done.
(else (let ((next-a (cdr a))) ; Continue scan.
(scan-a a (car next-a) next-a y b))))))
(scan-b (lambda (prev x a y b) ; Zip down B while its
(cond ((elt< y x) ; elts are < (CAR A).
(if (eq? b endb)
(begin (set-cdr! b a) enda) ; Done.
(let ((next-b (cdr b))) ; Continue scan.
(scan-b b x a (car next-b) next-b))))
(else (set-cdr! prev a)
(if (eq? a enda)
(begin (maybe-set-cdr! a b) endb) ; Done.
(let ((next-a (cdr a)))
(scan-a a (car next-a) next-a y b)))))))
;; This guy only writes if he has to. Called at most once.
;; Pointer equality rules; pure languages are for momma's boys.
(maybe-set-cdr! (lambda (pair val) (if (not (eq? (cdr pair) val))
(set-cdr! pair val)))))
(let ((x (car a)) (y (car b)))
(if (elt< y x)
;; B starts the answer list.
(values b (if (eq? b endb)
(begin (set-cdr! b a) enda)
(let ((next-b (cdr b)))
(scan-b b x a (car next-b) next-b))))
;; A starts the answer list.
(values a (if (eq? a enda)
(begin (maybe-set-cdr! a b) endb)
(let ((next-a (cdr a)))
(scan-a a (car next-a) next-a y b))))))))
;; (grow s ends ls ls2 u lw) -> [a enda la unused]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; The core routine.
;; - S is a sorted list of length LS > 1, with final cons cell ENDS.
;; (CDR ENDS) doesn't have to be nil.
;; - LS2 is some power of two <= LS.
;; - U is an unsorted list.
;; - LW is a positive integer.
;; Starting with S, and taking data from U as needed, produce
;; a sorted list of *at least* length LW, if there's enough data
;; (LW <= LS + length(U)), or use all of U if not.
;;
;; GROW takes maximal contiguous runs of data from U at a time;
;; it is allowed to return a list *longer* than LW if it gets lucky
;; with a long run.
;;
;; The key idea: If you want a merge operation to "pay for itself," the two
;; lists being merged should be about the same length. Remember that.
;;
;; Returns:
;; - A: The result list (not properly terminated)
;; - ENDA: The last cons cell of the result list.
;; - LA: The length of the result list
;; - UNUSED: The unused tail of U.
(define (grow s ends ls ls2 u lw)
(if (and (pair? u) (< ls lw))
;; We haven't met the LW quota but there's still some U data to use.
(mlet (((ls2) (let lp ((ls2 ls2))
(let ((ls2*2 (+ ls2 ls2)))
(if (<= ls2*2 ls) (lp ls2*2) ls2))))
;; LS2 is now the largest power of two <= LS.
;; (Just think of it as being roughly LS.)
((lr endr u2) (getrun u)) ; Get a run from U;
((t endt lt u3) (grow u endr lr 1 u2 ls2)) ; grow it up to be T.
((st end-st) (merge! s ends t endt))) ; Merge S & T,
(grow st end-st (+ ls lt) (+ ls2 ls2) u3 lw)) ; then loop.
(values s ends ls u))) ; Done -- met LW quota or ran out of data.
;; Note: (LENGTH LIS) or any constant guaranteed
;; to be greater can be used in place of INFINITY.
(if (pair? lis)
(mlet (((lr endr rest) (getrun lis)) ; Pick off an initial run.
((infinity) #o100000000) ; Then grow it up maximally.
((a enda la v) (grow lis endr lr 1 rest infinity)))
(set-cdr! enda '()) ; Nil-terminate answer.
a) ; We're done.
'())) ; Don't sort an empty list.
;;; Merge
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; These two merge procedures are stable -- ties favor list A.
(define (list-merge < a b)
(cond ((not (pair? a)) b)
((not (pair? b)) a)
(else (let recur ((x (car a)) (a a) ; A is a pair; X = (CAR A).
(y (car b)) (b b)) ; B is a pair; Y = (CAR B).
(if (< y x)
(let ((b (cdr b)))
(if (pair? b)
(cons y (recur x a (car b) b))
(cons y a)))
(let ((a (cdr a)))
(if (pair? a)
(cons x (recur (car a) a y b))
(cons x b))))))))
;;; This destructive merge does as few SET-CDR!s as it can -- for example, if
;;; the list is already sorted, it does no SET-CDR!s at all. It is also
;;; iterative, running in constant stack.
(define (list-merge! < a b)
;; The logic of these two loops is completely driven by these invariants:
;; SCAN-A: (CDR PREV) = A. X = (CAR A). Y = (CAR B).
;; SCAN-B: (CDR PREV) = B. X = (CAR A). Y = (CAR B).
(letrec ((scan-a (lambda (prev a x b y) ; Zip down A doing
(if (< y x) ; no SET-CDR!s until
(let ((next-b (cdr b))) ; we hit a B elt that
(set-cdr! prev b) ; has to be inserted.
(if (pair? next-b)
(scan-b b a x next-b (car next-b))
(set-cdr! b a)))
(let ((next-a (cdr a)))
(if (pair? next-a)
(scan-a a next-a (car next-a) b y)
(set-cdr! a b))))))
(scan-b (lambda (prev a x b y) ; Zip down B doing
(if (< y x) ; no SET-CDR!s until
(let ((next-b (cdr b))) ; we hit an A elt that
(if (pair? next-b) ; has to be
(scan-b b a x next-b (car next-b)) ; inserted.
(set-cdr! b a)))
(let ((next-a (cdr a)))
(set-cdr! prev a)
(if (pair? next-a)
(scan-a a next-a (car next-a) b y)
(set-cdr! a b)))))))
(cond ((not (pair? a)) b)
((not (pair? b)) a)
;; B starts the answer list.
((< (car b) (car a))
(let ((next-b (cdr b)))
(if (null? next-b)
(set-cdr! b a)
(scan-b b a (car a) next-b (car next-b))))
b)
;; A starts the answer list.
(else (let ((next-a (cdr a)))
(if (null? next-a)
(set-cdr! a b)
(scan-a a next-a (car next-a) b (car b))))
a))))
;;; Copyright
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; This code is
;;; Copyright (c) 1998 by Olin Shivers.
;;; The terms are: You may do as you please with this code, as long as
;;; you do not delete this notice or hold me responsible for any outcome
;;; related to its use.
;;;
;;; Blah blah blah.
;;; Code tuning & porting
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; This is very portable code. It's R4RS with the following exceptions:
;;; - The R5RS multiple-value VALUES & CALL-WITH-VALUES procedures for
;;; handling multiple-value return.
;;;
;;; This code is *tightly* bummed as far as I can go in portable Scheme.
;;;
;;; - The fixnum arithmetic in LIST-MERGE-SORT! and COUNTED-LIST-MERGE!
;;; that could be safely switched over to unsafe, fixnum-specific ops,
;;; if you're sure that 2*maxlen is a fixnum, where maxlen is the length
;;; of the longest list you could ever have.
;;;
;;; - I typically write my code in a style such that every CAR and CDR
;;; application is protected by an upstream PAIR?. This is the case in this
;;; code, so all the CAR's and CDR's could safely switched over to unsafe
;;; versions. But check over the code before you do it, in case the source
;;; has been altered since I wrote this.