scsh-0.6/scheme/sort/delndups.scm

;;; The SRFI-32 sort package -- delete neighboring duplicate elts
;;; Copyright (c) 1998 by Olin Shivers.
;;; This code is open-source; see the end of the file for porting and
;;; more copyright information.
;;; Olin Shivers 11/98.

;;; Problem:
;;; vector-delete-neighbor-dups pushes N stack frames, where N is the number
;;; of elements in the answer vector. This is arguably a very efficient thing
;;; to do, but it might blow out on a system with a limited stack but a big
;;; heap. We could rewrite this to "chunk" up answers in temp vectors if we
;;; push more than a certain number of frames, then allocate a final answer,
;;; copying all the chunks into the answer. But it's much more complex code.

;;; Exports:
;;; (list-delete-neighbor-dups  = lis) -> list
;;; (list-delete-neighbor-dups! = lis) -> list
;;; (vector-delete-neighbor-dups  = v [start end]) -> vector
;;; (vector-delete-neighbor-dups! = v [start end]) -> end'

;;; These procedures delete adjacent duplicate elements from a list or
;;; a vector, using a given element equality procedure. The first or leftmost
;;; element of a run of equal elements is the one that survives. The list
;;; or vector is not otherwise disordered.
;;;
;;; These procedures are linear time -- much faster than the O(n^2) general
;;; duplicate-elt deletors that do not assume any "bunching" of elements.
;;; If you want to delete duplicate elements from a large list or vector,
;;; sort the elements to bring equal items together, then use one of these
;;; procedures -- for a total time of O(n lg n).

;;; LIST-DELETE-NEIGHBOR-DUPS
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Below are multiple versions of the LIST-DELETE-NEIGHBOR-DUPS procedure,
;;; from simple to complex. RECUR's contract: Strip off any leading X's from
;;; LIS, and return that list neighbor-dup-deleted.
;;;
;;; The final version
;;; - shares a common subtail between the input & output list, up to 1024
;;;   elements;
;;; - Needs no more than 1024 stack frames.

;;; Simplest version.
;;; - Always allocates a fresh list / never shares storage.
;;; - Needs N stack frames, if answer is length N.
(define (list-delete-neighbor-dups = lis)
  (if (pair? lis)
      (let ((x0 (car lis)))
	(cons x0 (let recur ((x0 x0) (xs (cdr lis)))
		   (if (pair? xs)
		       (let ((x1  (car xs))
			     (x2+ (cdr xs)))
			  (if (= x0 x1)
			      (recur x0 x2+) ; Loop, actually.
			      (cons x1 (recur x1 x2+))))
		       xs))))
      lis))

;;; This version tries to use cons cells from input by sharing longest
;;; common tail between input & output. Still needs N stack frames, for ans
;;; of length N.
(define (list-delete-neighbor-dups = lis)
  (if (pair? lis)
      (let* ((x0 (car lis))
	     (xs (cdr lis))
	     (ans (let recur ((x0 x0) (xs xs))
		    (if (pair? xs)
			(let ((x1  (car xs))
			      (x2+ (cdr xs)))
			  (if (= x0 x1)
			      (recur x0 x2+)
			      (let ((ans-tail (recur x1 x2+)))
				(if (eq? ans-tail x2+) xs
				    (cons x1 ans-tail)))))
			xs))))
	(if (eq? ans xs) lis (cons x0 ans)))

      lis))

;;; This version tries to share the longest common tail between input & output,
;;; but it also refuses to push more than 1024 stack frames (which bounds the
;;; length of the shared suffix, as well). Useful for enormous lists that
;;; might otherwise blow out your stack. It basically computes 1024-element
;;; chunks of the answer, and then strings these results together using
;;; SET-CDR! to point the last cons cell of each chunk to the first cons cell
;;; of the following chunk.

(define (list-delete-neighbor-dups = lis)
  (letrec ((lp (lambda (last-pair xs)
		 (format #t "lp(last-pair=~a, xs=~a)~%" last-pair xs)
		 (if (pair? xs)
		     (let ((x0  (car xs))
			   (x1+ (cdr xs)))
		       (receive (chunk last-pair2 xs) (recur xs 2)
			 (format #t "<-recur: chunk=~a lp=~a xs=~a~%"
				 chunk last-pair2 xs)
			 (set-cdr! last-pair chunk)
			 (lp last-pair2 xs))))))

	   (kill (lambda (xs)
		   (let ((y (car xs)))
		     (values y (let lp ((xs (cdr xs)))
				 (if (pair? xs)
				     (let ((x0  (car xs))
					   (x1+ (cdr xs)))
				       (if (= y x0) (lp x1+) x1+))
				     '()))))))

	   (recur (lambda (xs n)
		    (format #t "->recur(xs=~a, n=~a)~%" xs n)
		    (if (pair? xs)
			(receive (x0 rest) (kill xs)
			  (if (pair? rest)
			      (if (< 1 n)
				  (receive (c lpr r) (recur rest (- n 1))
				    (values (cons x0 c) lpr r))
				  (receive (xn rest) (kill rest)
				    (let ((lpr (list xn)))
				      (values (cons x0 lpr) lpr rest))))

			      (values (list x0) '() '())))

			(values '() '() '())))))
    (if (pair? lis)
	(let ((x0  (car lis))
	      (x1+ (cdr lis)))
	  (receive (chunk last-pair rest) (recur lis 2)
	    (lp last-pair rest)
	    chunk))
	lis)))


;;; LIST-DELETE-NEIGHBOR-DUPS!
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; This code runs in constant list space, constant stack, and also
;;; does only the minimum SET-CDR!'s necessary.

(define (list-delete-neighbor-dups! = lis)
  (if (pair? lis)
      (let lp1 ((prev lis) (prev-elt (car lis)) (lis (cdr lis)))
	(if (pair? lis)
	    (let ((lis-elt (car lis))
		  (next (cdr lis)))
	      (if (= prev-elt lis-elt)

		  ;; We found the first elts of a run of dups, so we know
		  ;; we're going to have to do a SET-CDR!. Scan to the end of
		  ;; the run, do the SET-CDR!, and loop on LP1.
		  (let lp2 ((lis next))
		    (if (pair? lis)
			(let ((lis-elt (car lis))
			      (next (cdr lis)))
			  (if (= prev-elt lis-elt)
			      (lp2 next)
			      (begin (set-cdr! prev lis)
				     (lp1 lis lis-elt next))))
			(set-cdr! prev lis)))	; Ran off end => quit.

		  (lp1 lis lis-elt next))))))
  lis)


(define (vector-delete-neighbor-dups elt= v . maybe-start+end)
  (call-with-values
   (lambda () (vector-start+end v maybe-start+end))
   (lambda (start end)
     (if (< start end)
	 (let* ((x (vector-ref v start))
		(ans (let recur ((x x) (i start) (j 1))
		       (if (< i end)
			   (let ((y (vector-ref v i))
				 (nexti (+ i 1)))
			     (if (elt= x y)
				 (recur x nexti j)
				 (let ((ansvec (recur y nexti (+ j 1))))
				   (vector-set! ansvec j y)
				   ansvec)))
			   (make-vector j)))))
	   (vector-set! ans 0 x)
	   ans)
	 '#()))))


;;; Packs the surviving elements to the left, in range [start,end'),
;;; and returns END'.
(define (vector-delete-neighbor-dups! elt= v . maybe-start+end)
  (call-with-values
   (lambda () (vector-start+end v maybe-start+end))
   (lambda (start end)

     (if (>= start end)
	 end
	 ;; To eliminate unnecessary copying (read elt i then write the value
	 ;; back at index i), we scan until we find the first dup.
	 (let skip ((j start) (vj (vector-ref v start)))
	   (let ((j+1 (+ j 1)))
	     (if (>= j+1 end)
		 end
		 (let ((vj+1 (vector-ref v j+1)))
		   (if (not (elt= vj vj+1))
		       (skip j+1 vj+1)

		       ;; OK -- j & j+1 are dups, so we're committed to moving
		       ;; data around. In lp2, v[start,j] is what we've done;
		       ;; v[k,end) is what we have yet to handle.
		       (let lp2 ((j j) (vj vj) (k (+ j 2)))
			 (let lp3 ((k k))
			   (if (>= k end)
			       (+ j 1) ; Done.
			       (let ((vk (vector-ref v k))
				     (k+1 (+ k 1)))
				 (if (elt= vj vk)
				     (lp3 k+1)
				     (let ((j+1 (+ j 1)))
				       (vector-set! v j+1 vk)
				       (lp2 j+1 vk k+1))))))))))))))))

;;; Copyright
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; This code is
;;;     Copyright (c) 1998 by Olin Shivers.
;;; The terms are: You may do as you please with this code, as long as
;;; you do not delete this notice or hold me responsible for any outcome
;;; related to its use.
;;;
;;; Blah blah blah. Don't you think source files should contain more lines
;;; of code than copyright notice?
;;;
;;; Code porting
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; If your Scheme has a faster mechanism for handling optional arguments
;;; (e.g., Chez), you should definitely port over to it. Note that argument
;;; defaulting and error-checking are interleaved -- you don't have to
;;; error-check defaulted START/END args to see if they are fixnums that are
;;; legal vector indices for the corresponding vector, etc.