Wishful Coding

Didn't you ever wish your
computer understood you?

Take n distinct random items from a vector

Another IRC hacking session.

plot

plot

(ns test
  (:use (incanter core charts)))

; naive, O(n+m)
(defn take-rand1 [n coll] (take n (shuffle coll)))

; lazy, O(n!@#$%m^&)
(defn take-rand2 [n coll]
  (take n (distinct (repeatedly #(rand-nth coll)))))

; reduce, reorder, subvec, O(m)
(defn take-rand3 [nr coll]
  (let [len (count coll)
        ; why doesn't rand-int take a start?
        rand-int (fn [lo hi] (+ lo (rand-int (- hi lo))))]
    (subvec (->> (range nr)
                 (reduce #(conj %1 [%2 (rand-int %2 len)]) [])
                 (reduce
                   (fn swap [a [i b]]
                      (assoc a b (get a i) i (get a b)))
                   coll))
            0 nr)))

; amalloy, O(m)
(defn take-rand4 [num coll]
  (first
   (nth
    (iterate (fn [[ret candidates]]
               (let [idx (rand-int (count candidates))]
                 [(conj ret (candidates idx))
                  (subvec (assoc candidates idx (candidates 0))
                          1)]))
             [[]
              coll])
    num)))

(defn t [f]
  (let [start (. System (nanoTime))]
    (f)
     (- (. System (nanoTime)) start)))

(defn plot-len [f n]
  (let [coll (vec (range n))]
    (t #(doall (f 1000 coll)))))

(defn plot-take [f n]
  (let [coll (vec (range 100000))]
    (t #(doall (f n coll)))))

(def x (range 1000 100000 1000))

(-> (scatter-plot x (map (partial plot-len  take-rand1) x) :series-label "shuffle" :legend true)
    (add-points   x (map (partial plot-len  take-rand2) x) :series-label "filtered")
    (add-points   x (map (partial plot-len  take-rand3) x) :series-label "reduce")
    (add-points   x (map (partial plot-len  take-rand4) x) :series-label "iterate")
    (save "len.png")) ; http://yfrog.com/5r80779474p

(-> (scatter-plot x (map (partial plot-take take-rand1) x) :series-label "shuffle" :legend true)
    (add-points   x (map (partial plot-take take-rand2) x) :series-label "filtered")
    (add-points   x (map (partial plot-take take-rand3) x) :series-label "reduce")
    (add-points   x (map (partial plot-take take-rand4) x) :series-label "iterate")
    (save "take.png")) ; http://yfrog.com/mr42647628p
Published on

Sorting obsession

It all started when, one day, I was thinking about the little I knew about sorting. What I knew was that sorting was usually done by comparing all items and switching them around, hitting the magical O(nlogn) barrier.

Thinking about how I would sort a deck of cards, sorting seemed more like a O(n) operation. With that in mind, I started implementing what turned out to be called Pigeonhole Sort. I also learned that sorting algorithms can be classified as being a Comparison sort or not.

(ns pigeonhole-sort)

(defn int-sort [s]
  (let [listmap (reduce #(update-in
                           (update-in %1 [%2] (fnil inc 0))
                           [:max] max %2) {:max 0} s)]
    (mapcat #(repeat (get listmap % 0) %)
            (range (inc (:max listmap))))))

With that out of the way, I started to compare my sort to the sort used by Java and Clojure, which turned out to be using Timsort, originating in Python. In my quest to understand Timsort - and why it was faster than my Pigeonhole Sort - I found this great website that graphs how different sort algorithms work.

Simply put, I was intrigued by the pattern displayed by Heap Sort. So next I found myself reading the Wikipedia pages for Binary Heaps, and shortly after that, I had started implementing that in Clojure.

This is, ladies and gentleman, a persistent heap, in all its heapyness, with a heapsort implemented on top of it.

(ns persistent-heap)

(defn swap [heap idx idy]
  (assoc heap idx (get heap idy) idy (get heap idx)))

(defn children [idx]
  (let [idx (inc (* idx 2))
        idy (inc idx)]
    [idx idy]))

(defn parent [idx]
  (if (not= 0 idx)
    (/ (- idx (if (odd? idx) 1 2)) 2)
    nil))

(defn tree 
  ([heap] (tree heap 0))
  ([heap idx]
   (let [[left right] (children idx)
         node (get heap idx nil)]
     (when node
       [node (tree heap left) (tree heap right)]))))

(defn heap-up 
  ([heap] (heap-up heap >= (dec (count heap))))
  ([heap compfn] (heap-up heap compfn (dec (count heap))))
  ([heap compfn idx]
   (if-let [par (parent idx)]
     (if (compfn (get heap idx) (get heap par))
       (recur (swap heap idx par) compfn par)
       heap)
     heap)))

(defn heap-down
  ([heap] (heap-down (pop (swap heap 0 (dec (count heap)))) >= 0))
  ([heap compfn] (heap-down (pop (swap heap 0 (dec (count heap)))) compfn 0))
  ([heap compfn idx]
   (let [[left right] (children idx)
         leftv (get heap left nil)
         rightv (get heap right nil)
         node (get heap idx nil)]
     (if (and node leftv rightv)
       (cond
         (compfn leftv (max rightv node))
           (recur (swap heap idx left) compfn left)
         (compfn rightv (max leftv node))
           (recur (swap heap idx right) compfn right)
         :else heap)
       heap))))

(deftype PersistentHeap [heap]
         clojure.lang.ISeq
         (first [this] (first heap))
         (next [this] (PersistentHeap. (heap-down heap)))
         (more [this] (.next this))
         (cons [this obj] (PersistentHeap. (heap-up (conj heap obj))))
         (seq [this] (seq heap)))

(defn persistent-heap [coll]
  (into (PersistentHeap. []) coll))

(defn heapsort [coll]
  (->> (persistent-heap coll)
       (iterate rest)
       (take (count coll))
       (map first)))

Some 'benchmark' results:

user=> (time (dorun (heapsort (shuffle (range 1e5)))))
"Elapsed time: 3148.041 msecs"
nil
user=> (time (dorun (int-sort (shuffle (range 1e5)))))
"Elapsed time: 477.781 msecs"
nil
user=> (time (dorun (sort (shuffle (range 1e5)))))
"Elapsed time: 105.354 msecs"

Note that heapsort is lazy, insofar that it does realize but does not sort the sequence untill requested. There is also a bug that allows for the last 2 items to be out of order.

I'm tired of the NoSQL buzz, and bring you SlouchDB

It has all the features your favorite NoSQL database has.

  • Fast
  • Scalable
  • Concurrent
  • Key/value store
  • Document orientated
  • MapReduce
  • In-memory
  • Persistent *
  • NoSQL
  • REST API *
  • Written in X, so...
* Requires a few extra lines.

The implementation looks like this.

(ns slouchdb)

; Fast scalable in-memory concurrent NoSQL database
(def db (ref {:bob  {:age 59 :sex :male}
              :bill {:age 17 :sex :male}
              :mary {:age 28 :sex :female}}))

;; Views ;;

(defn total-age []
  (reduce + (pmap (comp :age val) @db))) ; Parallel MapReduce!

;; Examples ;;

(println (apply str (interpose \newline
  ["Get an item"
  (:bob @db)
  "MapReduce"
  (total-age)
  "Add/update an item"
  (dosync (alter db assoc-in [:bill :age] 25))
  "Delete an item"
  (dosync (alter db dissoc :mary))])))

; For scaling, press CMD or CTRL and + or -

; For a persistent version, see java.io.Serializable

; For a REST API, check out Moustache and Aleph.
Published on