Wishful Coding

Take n distinct random items from a vector

Another IRC hacking session.

plot

(ns test
  (:use (incanter core charts)))

; naive, O(n+m)
(defn take-rand1 [n coll] (take n (shuffle coll)))

; lazy, O(n!@#$%m^&)
(defn take-rand2 [n coll]
  (take n (distinct (repeatedly #(rand-nth coll)))))

; reduce, reorder, subvec, O(m)
(defn take-rand3 [nr coll]
  (let [len (count coll)
        ; why doesn't rand-int take a start?
        rand-int (fn [lo hi] (+ lo (rand-int (- hi lo))))]
    (subvec (->> (range nr)
                 (reduce #(conj %1 [%2 (rand-int %2 len)]) [])
                 (reduce
                   (fn swap [a [i b]]
                      (assoc a b (get a i) i (get a b)))
                   coll))
            0 nr)))

; amalloy, O(m)
(defn take-rand4 [num coll]
  (first
   (nth
    (iterate (fn [[ret candidates]]
               (let [idx (rand-int (count candidates))]
                 [(conj ret (candidates idx))
                  (subvec (assoc candidates idx (candidates 0))
                          1)]))
             [[]
              coll])
    num)))

(defn t [f]
  (let [start (. System (nanoTime))]
    (f)
     (- (. System (nanoTime)) start)))

(defn plot-len [f n]
  (let [coll (vec (range n))]
    (t #(doall (f 1000 coll)))))

(defn plot-take [f n]
  (let [coll (vec (range 100000))]
    (t #(doall (f n coll)))))

(def x (range 1000 100000 1000))

(-> (scatter-plot x (map (partial plot-len  take-rand1) x) :series-label "shuffle" :legend true)
    (add-points   x (map (partial plot-len  take-rand2) x) :series-label "filtered")
    (add-points   x (map (partial plot-len  take-rand3) x) :series-label "reduce")
    (add-points   x (map (partial plot-len  take-rand4) x) :series-label "iterate")
    (save "len.png")) ; http://yfrog.com/5r80779474p

(-> (scatter-plot x (map (partial plot-take take-rand1) x) :series-label "shuffle" :legend true)
    (add-points   x (map (partial plot-take take-rand2) x) :series-label "filtered")
    (add-points   x (map (partial plot-take take-rand3) x) :series-label "reduce")
    (add-points   x (map (partial plot-take take-rand4) x) :series-label "iterate")
    (save "take.png")) ; http://yfrog.com/mr42647628p

Published on 01 February 2011.

clojure

Sorting obsession

It all started when, one day, I was thinking about the little I knew about sorting. What I knew was that sorting was usually done by comparing all items and switching them around, hitting the magical O(nlogn) barrier.

Thinking about how I would sort a deck of cards, sorting seemed more like a O(n) operation. With that in mind, I started implementing what turned out to be called Pigeonhole Sort. I also learned that sorting algorithms can be classified as being a Comparison sort or not.

(ns pigeonhole-sort)

(defn int-sort [s]
  (let [listmap (reduce #(update-in
                           (update-in %1 [%2] (fnil inc 0))
                           [:max] max %2) {:max 0} s)]
    (mapcat #(repeat (get listmap % 0) %)
            (range (inc (:max listmap))))))

With that out of the way, I started to compare my sort to the sort used by Java and Clojure, which turned out to be using Timsort, originating in Python. In my quest to understand Timsort - and why it was faster than my Pigeonhole Sort - I found this great website that graphs how different sort algorithms work.

Simply put, I was intrigued by the pattern displayed by Heap Sort. So next I found myself reading the Wikipedia pages for Binary Heaps, and shortly after that, I had started implementing that in Clojure.

This is, ladies and gentleman, a persistent heap, in all its heapyness, with a heapsort implemented on top of it.

(ns persistent-heap)

(defn swap [heap idx idy]
  (assoc heap idx (get heap idy) idy (get heap idx)))

(defn children [idx]
  (let [idx (inc (* idx 2))
        idy (inc idx)]
    [idx idy]))

(defn parent [idx]
  (if (not= 0 idx)
    (/ (- idx (if (odd? idx) 1 2)) 2)
    nil))

(defn tree 
  ([heap] (tree heap 0))
  ([heap idx]
   (let [[left right] (children idx)
         node (get heap idx nil)]
     (when node
       [node (tree heap left) (tree heap right)]))))

(defn heap-up 
  ([heap] (heap-up heap >= (dec (count heap))))
  ([heap compfn] (heap-up heap compfn (dec (count heap))))
  ([heap compfn idx]
   (if-let [par (parent idx)]
     (if (compfn (get heap idx) (get heap par))
       (recur (swap heap idx par) compfn par)
       heap)
     heap)))

(defn heap-down
  ([heap] (heap-down (pop (swap heap 0 (dec (count heap)))) >= 0))
  ([heap compfn] (heap-down (pop (swap heap 0 (dec (count heap)))) compfn 0))
  ([heap compfn idx]
   (let [[left right] (children idx)
         leftv (get heap left nil)
         rightv (get heap right nil)
         node (get heap idx nil)]
     (if (and node leftv rightv)
       (cond
         (compfn leftv (max rightv node))
           (recur (swap heap idx left) compfn left)
         (compfn rightv (max leftv node))
           (recur (swap heap idx right) compfn right)
         :else heap)
       heap))))

(deftype PersistentHeap [heap]
         clojure.lang.ISeq
         (first [this] (first heap))
         (next [this] (PersistentHeap. (heap-down heap)))
         (more [this] (.next this))
         (cons [this obj] (PersistentHeap. (heap-up (conj heap obj))))
         (seq [this] (seq heap)))

(defn persistent-heap [coll]
  (into (PersistentHeap. []) coll))

(defn heapsort [coll]
  (->> (persistent-heap coll)
       (iterate rest)
       (take (count coll))
       (map first)))

Some 'benchmark' results:

user=> (time (dorun (heapsort (shuffle (range 1e5)))))
"Elapsed time: 3148.041 msecs"
nil
user=> (time (dorun (int-sort (shuffle (range 1e5)))))
"Elapsed time: 477.781 msecs"
nil
user=> (time (dorun (sort (shuffle (range 1e5)))))
"Elapsed time: 105.354 msecs"

Note that heapsort is lazy, insofar that it does realize but does not sort the sequence untill requested. There is also a bug that allows for the last 2 items to be out of order.

Published on 15 January 2011.

clojure sorting

I'm tired of the NoSQL buzz, and bring you SlouchDB

It has all the features your favorite NoSQL database has.

Fast
Scalable
Concurrent
Key/value store
Document orientated
MapReduce
In-memory
Persistent *
NoSQL
REST API *
Written in X, so...

* Requires a few extra lines.

The implementation looks like this.

(ns slouchdb)

; Fast scalable in-memory concurrent NoSQL database
(def db (ref {:bob  {:age 59 :sex :male}
              :bill {:age 17 :sex :male}
              :mary {:age 28 :sex :female}}))

;; Views ;;

(defn total-age []
  (reduce + (pmap (comp :age val) @db))) ; Parallel MapReduce!

;; Examples ;;

(println (apply str (interpose \newline
  ["Get an item"
  (:bob @db)
  "MapReduce"
  (total-age)
  "Add/update an item"
  (dosync (alter db assoc-in [:bill :age] 25))
  "Delete an item"
  (dosync (alter db dissoc :mary))])))

; For scaling, press CMD or CTRL and + or -

; For a persistent version, see java.io.Serializable

; For a REST API, check out Moustache and Aleph.

Published on 13 January 2011.

nosql

Previous Page 69 of 90 Next