|
| 1 | +^{:kindly/hide-code true |
| 2 | + :clay {:title "Visual data summaries" |
| 3 | + :quarto {:type :post |
| 4 | + :author [:timothypratley] |
| 5 | + :draft true |
| 6 | + :date "2026-01-011" |
| 7 | + :description "Can we plot interesting charts for all columns of a dataset?" |
| 8 | + :category :data-visualization |
| 9 | + :tags [:datavis :algebra] |
| 10 | + :keywords [:datavis :composition :operators]}}} |
| 11 | +(ns data-visualization.aog.column-combinations |
| 12 | + (:require |
| 13 | + [clojure.string :as str] |
| 14 | + [fastmath.stats :as fms] |
| 15 | + [scicloj.kindly.v4.kind :as kind] |
| 16 | + [scicloj.metamorph.ml.rdatasets :as rdatasets] |
| 17 | + [tablecloth.api :as tc] |
| 18 | + [tablecloth.column.api :as tcc])) |
| 19 | + |
| 20 | +;; When exploring a new dataset, we face an immediate challenge: |
| 21 | +;; How do we quickly understand the structure and distribution of all our columns? |
| 22 | +;; |
| 23 | +;; This notebook explores a "show everything" approach: |
| 24 | + |
| 25 | +;; - Visual summaries for each column (distributions, categories) |
| 26 | +;; - Summary statistics paired with visualizations |
| 27 | +;; - A scatterplot-matrix-like view showing all column combinations |
| 28 | +;; |
| 29 | +;; The goal is to enable rapid visual discovery of patterns and relationships. |
| 30 | + |
| 31 | +;; ## Starting with a Complete Dataset |
| 32 | +;; |
| 33 | +;; Let's load a well-known dataset and explore how to present its columns effectively: |
| 34 | + |
(def penguins
  "The Palmer penguins dataset from rdatasets, passed through
  `tc/drop-missing` so the examples below work on complete rows only."
  (-> (rdatasets/palmerpenguins-penguins)
      tc/drop-missing))
| 37 | + |
| 38 | +;; ### Option 1: Print the data |
| 39 | + |
| 40 | +;; We could just print the first few rows, but that only shows a small sample: |
| 41 | + |
| 42 | +penguins |
| 43 | + |
| 44 | +;; ### Option 2: Summary statistics |
| 45 | +;; We could compute statistics for a single column: |
| 46 | + |
| 47 | +(fms/stats-map (:bill-length-mm penguins)) |
| 48 | + |
| 49 | +;; But this requires mental effort to visualize what the numbers mean. |
| 50 | +;; |
| 51 | +;; ### Option 3: Visual summaries |
| 52 | + |
| 53 | +;; What if we automatically plot the distribution of every column? |
| 54 | +;; This lets us see patterns at a glance. |
| 55 | + |
| 56 | +;; ## Visualization inference |
| 57 | + |
(def plot-width
  "Width, in SVG user units, of a single plot cell."
  100)

(def plot-height
  "Height, in SVG user units, of a single plot cell."
  100)
| 60 | + |
| 61 | +;; Type detection: determines whether to show histograms (numeric) or bar charts (categorical) |
| 62 | + |
(defn is-numeric-type?
  "Returns true when the element datatype of column `col` (as reported by
  `tcc/typeof`) is one of the primitive integer or floating-point types.

  The set covers every signed/unsigned integer width as well as both float
  widths. Checking only :int32/:int64/:float32/:float64 would classify a
  column stored as e.g. :int16 as categorical and chart it with bars
  instead of a histogram."
  [col]
  (contains? #{:int8 :uint8 :int16 :uint16 :int32 :uint32 :int64 :uint64
               :float32 :float64}
             (tcc/typeof col)))
| 69 | + |
(defn- scale-to
  "Returns a function mapping numbers in [lo, hi] linearly onto [0, extent].
  When lo equals hi every value maps to the middle of the extent, which
  avoids a division by zero for constant columns."
  [lo hi extent]
  (let [span (- hi lo)]
    (if (zero? span)
      (constantly (/ extent 2.0))
      (fn [v] (* extent (/ (double (- v lo)) span))))))

(defn- count-bars
  "Returns one [:rect ...] per entry of `counts`, laid out left to right with
  equal widths and heights proportional to the largest count.
  Returns nil for an empty `counts`, so no division by zero can occur.
  All geometry is computed with doubles: a Clojure ratio such as 100/3 would
  be printed literally into the SVG attribute, which is not a valid number."
  [counts]
  (when (seq counts)
    (let [tallest   (apply max counts)
          bar-width (/ (double plot-width) (count counts))]
      (map-indexed
       (fn [i n]
         (let [bar-height (if (pos? tallest)
                            (* plot-height (/ (double n) tallest))
                            0.0)]
           [:rect {:x (* i bar-width)
                   :y (- plot-height bar-height)
                   :width bar-width
                   :height bar-height
                   :fill "lightblue"
                   :stroke "gray"
                   :stroke-width 0.5}]))
       counts))))

(defn- xy-pixels
  "Pairs the values of columns `x` and `y` of `data` row by row, drops pairs
  where either value is nil, and scales each axis onto the plot cell.
  The y axis is flipped because SVG y grows downwards.
  Both columns must hold numbers. Returns a vector of [px py] pairs, or nil
  when there is nothing to plot."
  [data x y]
  (let [pairs (->> (map vector (data x) (data y))
                   (filter (fn [[xv yv]] (and (some? xv) (some? yv)))))]
    (when (seq pairs)
      (let [xs (map first pairs)
            ys (map second pairs)
            sx (scale-to (apply min xs) (apply max xs) plot-width)
            sy (scale-to (apply min ys) (apply max ys) plot-height)]
        (mapv (fn [[xv yv]] [(sx xv) (- plot-height (sy yv))])
              pairs)))))

(defn plot-basic
  "Renders the graphic `g` as hiccup SVG elements sized to
  `plot-width` x `plot-height`.

  `g` is a vector whose second element is a map with:
    :data      - callable with a column name, returning that column's values
    :mappings  - map with :x and, for :point / :line, :y column names
    :geometry  - sequence of geometries: :bar, :histogram, :point or :line

  Returns a lazy sequence with one entry per geometry:
    :bar       - one rect per distinct non-nil x value (first-seen order),
                 height proportional to its frequency
    :histogram - one rect per bin of `fms/histogram` over the non-nil x values
    :point     - one circle per row with both x and y present, scaled to the cell
    :line      - a single path through the same scaled points
  An entry is nil/empty when there is no data for it. Any other geometry
  makes `case` throw, since there is no default clause."
  [g]
  (let [{:keys [data mappings geometry]} (g 1)
        {:keys [x y]} mappings]
    (for [geom geometry]
      (case geom
        :bar (let [x-vals  (remove nil? (data x))
                   tallies (frequencies x-vals)]
               ;; `distinct` keeps first-seen order; a large frequencies map does not.
               (count-bars (map tallies (distinct x-vals))))
        :histogram (let [values (remove nil? (data x))]
                     (when (seq values)
                       (count-bars (map :count (:bins-maps (fms/histogram values))))))
        :point (for [[cx cy] (xy-pixels data x y)]
                 [:circle {:r 2, :cx cx, :cy cy, :fill "lightblue"}])
        :line (when-let [pts (seq (xy-pixels data x y))]
                ;; "M x,y L x,y L x,y ..." stays valid even for a single point.
                [:path {:d (str "M " (str/join " L " (map #(str/join "," %) pts)))}])))))
| 115 | + |
(defn plot-distribution
  "Wraps the output of `plot-basic` for one column in a standalone SVG element.
  `ds` supplies the data, `column` is mapped to x, and `geom` is the
  sequence of geometries to draw (e.g. [:histogram]).
  The returned hiccup vector is tagged with :kind/hiccup metadata."
  [ds column geom]
  (let [graphic  [:graphic {:data ds
                            :mappings {:x column}
                            :geometry geom}]
        view-box (str/join " " [0 0 plot-width plot-height])]
    ^:kind/hiccup
    [:svg {:width 100
           :viewBox view-box
           :xmlns "http://www.w3.org/2000/svg"
           :style {:border "solid 1px gray"}}
     [:g {:stroke "gray", :fill "none"}
      (plot-basic graphic)]]))
| 126 | + |
| 127 | +(plot-distribution penguins :bill-length-mm [:histogram]) |
| 128 | + |
| 129 | +;; ## Single Column Summaries |
| 130 | +;; |
| 131 | +;; The summarize function automatically selects the right visualization type: |
| 132 | + |
| 133 | +;; - Numeric columns → histogram (shows distribution shape) |
| 134 | +;; - Categorical columns → bar chart (shows frequencies) |
| 135 | + |
(defn summarize
  "Plots the distribution of `column` in `ds`, choosing the geometry from the
  column type: a histogram when `is-numeric-type?` holds, a bar chart otherwise."
  [ds column]
  (let [geom (if (is-numeric-type? (ds column)) :histogram :bar)]
    (plot-distribution ds column [geom])))
| 140 | + |
| 141 | +;; Companion function: provides numeric summaries alongside visualizations |
| 142 | +;; Shows count, mean, standard deviation, min/max for numeric data |
| 143 | +;; Shows count and unique values for categorical data |
| 144 | + |
(defn get-summary-stats
  "Returns a one-line text summary of `column` in `ds`, ignoring nil values.
  Numeric columns report size, mean, standard deviation, min and max taken
  from `fms/stats-map`; all other columns report the number of values and
  the number of distinct values."
  [ds column]
  (let [col     (ds column)
        present (remove nil? col)]
    (if-not (is-numeric-type? col)
      (str "n: " (count present) ", unique: " (count (distinct present)))
      (let [{:keys [Size Mean SD Min Max]} (fms/stats-map present)]
        (format "n: %d, μ: %.2f, σ: %.2f, min: %.2f, max: %.2f"
                Size Mean SD Min Max)))))
| 158 | + |
| 159 | +;; ## Summary Table: All Columns at a Glance |
| 160 | +;; |
| 161 | +;; Combines visualization + statistics for every column. |
| 162 | +;; This gives us a complete overview of the dataset's structure. |
| 163 | + |
(defn visual-summary
  "Builds a `kind/table` with one row per column of `ds`:
  the column name, its `summarize` plot, and its `get-summary-stats` text."
  [ds]
  (->> (tc/column-names ds)
       (map (fn [column-name]
              [column-name
               (summarize ds column-name)
               (get-summary-stats ds column-name)]))
       ;; realize the rows eagerly before handing them to the table
       doall
       kind/table))
| 168 | + |
| 169 | +(visual-summary penguins) |
| 170 | + |
| 171 | +;; ## Matrix View: All Column Combinations |
| 172 | +;; |
| 173 | +;; The next step: instead of showing each column separately, |
| 174 | +;; what if we show how every column relates to every other column? |
| 175 | +;; This is the idea behind the scatterplot matrix. |
| 176 | +;; |
| 177 | +;; The matrix automatically chooses the right chart for each combination: |
| 178 | + |
| 179 | +;; - Numeric × Numeric → scatter plot (reveal relationships) |
| 180 | +;; - Otherwise → bar chart (category counts of the x column; the y column is not drawn) |
| 181 | + |
(defn matrix
  "Draws a grid of plots, one cell per ordered pair of columns of `ds`,
  inside a single SVG tagged with :kind/hiccup metadata.
  The cell for columns (a, b) is translated to column index of a (x) and
  column index of b (y), framed with a rect, and filled by `plot-basic`:
  a :point geometry when both columns are numeric, a :bar geometry otherwise."
  [ds]
  (let [column-names (tc/column-names ds)
        side         (count column-names)
        indexed      (map-indexed vector column-names)
        numeric?     (fn [column-name] (is-numeric-type? (ds column-name)))
        cell         (fn [[a-idx a] [b-idx b]]
                       [:g {:transform (str "translate(" (* a-idx plot-width) "," (* b-idx plot-height) ")")}
                        [:rect {:x 0 :y 0 :width plot-width :height plot-height
                                :fill "none" :stroke "gray" :stroke-width 1}]
                        (plot-basic [:graphic {:data ds
                                               :mappings {:x a :y b}
                                               :geometry (if (and (numeric? a) (numeric? b))
                                                           [:point]
                                                           [:bar])}])])]
    ^:kind/hiccup
    [:svg {:width "100%"
           :viewBox (str/join " " [0 0 (* plot-width side) (* plot-height side)])
           :xmlns "http://www.w3.org/2000/svg"
           :style {:border "solid 1px gray"}}
     [:g {:stroke "gray", :fill "none"}
      (for [col-a indexed
            col-b indexed]
        (cell col-a col-b))]]))
| 205 | + |
| 206 | +(matrix penguins) |
0 commit comments