Skip to content

Commit ef145ce

Browse files
add column-combos
1 parent f68ffe6 commit ef145ce

File tree

1 file changed

+206
-0
lines changed

1 file changed

+206
-0
lines changed
Lines changed: 206 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,206 @@
1+
^{:kindly/hide-code true
2+
:clay {:title "Visual data summaries"
3+
:quarto {:type :post
4+
:author [:timothypratley]
5+
:draft true
6+
:date "2026-01-11"
7+
:description "Can we plot interesting charts for all columns of a dataset?"
8+
:category :data-visualization
9+
:tags [:datavis :algebra]
10+
:keywords [:datavis :composition :operators]}}}
11+
(ns data-visualization.aog.column-combinations
12+
(:require
13+
[clojure.string :as str]
14+
[fastmath.stats :as fms]
15+
[scicloj.kindly.v4.kind :as kind]
16+
[scicloj.metamorph.ml.rdatasets :as rdatasets]
17+
[tablecloth.api :as tc]
18+
[tablecloth.column.api :as tcc]))
19+
20+
;; When exploring a new dataset, we face an immediate challenge:
21+
;; How do we quickly understand the structure and distribution of all our columns?
22+
;;
23+
;; This notebook explores a "show everything" approach:
24+
25+
;; - Visual summaries for each column (distributions, categories)
26+
;; - Summary statistics paired with visualizations
27+
;; - A scatterplot-matrix-like view showing all column combinations
28+
;;
29+
;; The goal is to enable rapid visual discovery of patterns and relationships.
30+
31+
;; ## Starting with a Complete Dataset
32+
;;
33+
;; Let's load a well-known dataset and explore how to present its columns effectively:
34+
35+
(def penguins
  "The Palmer penguins dataset with every row that has a missing value removed."
  (-> (rdatasets/palmerpenguins-penguins)
      tc/drop-missing))
37+
38+
;; ### Option 1: Print the data
39+
40+
;; We could just print the first few rows, but that only shows a small sample:
41+
42+
penguins
43+
44+
;; ### Option 2: Summary statistics
45+
;; We could compute statistics for a single column:
46+
47+
(fms/stats-map (:bill-length-mm penguins))
48+
49+
;; But this requires mental effort to visualize what the numbers mean.
50+
;;
51+
;; ### Option 3: Visual summaries
52+
53+
;; What if we automatically plot the distribution of every column?
54+
;; This lets us see patterns at a glance.
55+
56+
;; ## Visualization inference
57+
58+
(def plot-width "Width of one plot cell, in SVG viewBox units." 100)
59+
(def plot-height "Height of one plot cell, in SVG viewBox units." 100)
60+
61+
;; Type detection: determines whether to show histograms (numeric) or bar charts (categorical)
62+
63+
(def ^:private numeric-dtypes
  "Column element types that are treated as numeric."
  #{:int8 :int16 :int32 :int64
    :uint8 :uint16 :uint32 :uint64
    :float32 :float64})

(defn is-numeric-type?
  "Returns true when the element type of `col` (as reported by `tcc/typeof`)
  is one of the integer or floating-point dtypes, false otherwise.

  Narrow and unsigned integer columns are included so that they are charted
  as numeric data instead of falling through to the categorical path."
  [col]
  (contains? numeric-dtypes (tcc/typeof col)))
69+
70+
(defn- count-bars
  "Returns a seq of hiccup [:rect] bars, one per entry of `counts`, laid out
  left to right across `plot-width` and scaled so that the largest count
  spans `plot-height`. Returns nil when `counts` is empty or contains no
  positive count.

  Sizes are coerced to doubles: dividing integers yields Clojure ratios
  (e.g. 100/3), which do not print as valid SVG numbers."
  [counts]
  (when (seq counts)
    (let [max-count (apply max counts)
          bar-width (double (/ plot-width (count counts)))]
      (when (pos? max-count)
        (for [[i n] (map-indexed vector counts)]
          (let [bar-height (double (* (/ n max-count) plot-height))]
            [:rect {:x (* i bar-width)
                    :y (- plot-height bar-height)
                    :width bar-width
                    :height bar-height
                    :fill "lightblue"
                    :stroke "gray"
                    :stroke-width 0.5}]))))))

(defn- linear-scale
  "Returns a function mapping numbers from [lo, hi] onto [out-lo, out-hi] as
  doubles. When lo equals hi every input maps to the middle of the output
  range, which avoids dividing by a zero span."
  [lo hi out-lo out-hi]
  (let [span (- hi lo)]
    (if (zero? span)
      (constantly (double (/ (+ out-lo out-hi) 2)))
      (fn [v]
        (double (+ out-lo (* (/ (- v lo) span) (- out-hi out-lo))))))))

(defn- scaled-xys
  "Pairs up the values of columns `x` and `y` of `data` and returns a vector
  of [px py] positions scaled into the plot area, or nil when there is no
  pair with two numeric coordinates.

  Values are read column-wise via `(data x)` / `(data y)`; iterating the
  dataset itself is avoided because that walks its columns, not its rows."
  [data x y]
  (let [pad 2 ; keeps radius-2 markers inside the cell
        pairs (filter (fn [[xv yv]] (and (number? xv) (number? yv)))
                      (map vector (data x) (data y)))]
    (when (seq pairs)
      (let [xs (map first pairs)
            ys (map second pairs)
            sx (linear-scale (apply min xs) (apply max xs)
                             pad (- plot-width pad))
            ;; SVG y grows downward, so the output range is flipped.
            sy (linear-scale (apply min ys) (apply max ys)
                             (- plot-height pad) pad)]
        (mapv (fn [[xv yv]] [(sx xv) (sy yv)]) pairs)))))

(defn plot-basic
  "Turns a graphic spec into hiccup SVG elements.

  `g` is a vector `[tag {:data .. :mappings {:x .. :y ..} :geometry [..]}]`.
  `data` is called with a column name to obtain that column's values.
  Returns a lazy seq with one entry per keyword in `:geometry`:

  - `:bar`       one rect per distinct non-nil x value, in order of first
                 appearance, with height proportional to its frequency
  - `:histogram` one rect per bin of `fms/histogram` over the non-nil x values
  - `:point`     one circle per (x, y) pair, scaled to the plot area
  - `:line`      a single path through the (x, y) pairs, scaled likewise

  An entry is nil when there is nothing to draw for a bar, histogram or line
  geometry. A geometry keyword other than the four above makes `case` throw."
  [g]
  (let [[_ {:keys [data mappings geometry]}] g
        {:keys [x y]} mappings]
    (for [geom geometry]
      (case geom
        :bar (let [x-vals (remove nil? (data x))
                   tallies (frequencies x-vals)]
               (count-bars (map tallies (distinct x-vals))))
        :histogram (let [values (remove nil? (data x))]
                     (when (seq values)
                       (count-bars (map :count
                                        (:bins-maps (fms/histogram values))))))
        :point (for [[cx cy] (scaled-xys data x y)]
                 [:circle {:r 2, :cx cx, :cy cy, :fill "lightblue"}])
        :line (when-let [[start & more] (seq (scaled-xys data x y))]
                [:path {:d (str "M " (str/join "," start)
                                " L " (str/join " "
                                                (map #(str/join "," %)
                                                     more)))}])))))
115+
116+
(defn plot-distribution
  "Builds a small bordered SVG (hiccup carrying `:kind/hiccup` metadata) that
  draws `column` of `ds` with the geometries listed in `geom`, e.g.
  `[:histogram]` or `[:bar]`. Only the x mapping is set."
  [ds column geom]
  (let [graphic [:graphic {:data ds
                           :mappings {:x column}
                           :geometry geom}]
        view-box (str "0 0 " plot-width " " plot-height)]
    (with-meta
      [:svg {:width 100
             :viewBox view-box
             :xmlns "http://www.w3.org/2000/svg"
             :style {:border "solid 1px gray"}}
       [:g {:stroke "gray", :fill "none"}
        (plot-basic graphic)]]
      {:kind/hiccup true})))
126+
127+
(plot-distribution penguins :bill-length-mm [:histogram])
128+
129+
;; ## Single Column Summaries
130+
;;
131+
;; The summarize function automatically selects the right visualization type:
132+
133+
;; - Numeric columns → histogram (shows distribution shape)
134+
;; - Categorical columns → bar chart (shows frequencies)
135+
136+
(defn summarize
  "Plots the distribution of `column` in `ds`: a histogram when
  `is-numeric-type?` holds for the column, a bar chart otherwise."
  [ds column]
  (let [geom (if (is-numeric-type? (ds column)) :histogram :bar)]
    (plot-distribution ds column [geom])))
140+
141+
;; Companion function: provides numeric summaries alongside visualizations
142+
;; Shows count, mean, standard deviation, min/max for numeric data
143+
;; Shows count and unique values for categorical data
144+
145+
(defn get-summary-stats
  "Returns a one-line text summary of `column` in `ds`, computed over its
  non-nil values.

  Numeric columns report size, mean, standard deviation, min and max taken
  from `fms/stats-map`; all other columns report the number of values and
  the number of distinct values."
  [ds column]
  (let [col (ds column)
        present (remove nil? col)]
    (if-not (is-numeric-type? col)
      (str "n: " (count present) ", unique: " (count (distinct present)))
      (let [{:keys [Size Mean SD Min Max]} (fms/stats-map present)]
        (format "n: %d, μ: %.2f, σ: %.2f, min: %.2f, max: %.2f"
                Size Mean SD Min Max)))))
158+
159+
;; ## Summary Table: All Columns at a Glance
160+
;;
161+
;; Combines visualization + statistics for every column.
162+
;; This gives us a complete overview of the dataset's structure.
163+
164+
(defn visual-summary
  "Builds a `kind/table` with one row per column of `ds`; each row is
  `[column-name chart stats-text]`, where the chart comes from `summarize`
  and the text from `get-summary-stats`."
  [ds]
  (->> (tc/column-names ds)
       (map (juxt identity
                  (partial summarize ds)
                  (partial get-summary-stats ds)))
       doall ; realize every row before handing the seq to kind/table
       kind/table))
168+
169+
(visual-summary penguins)
170+
171+
;; ## Matrix View: All Column Combinations
172+
;;
173+
;; The next step: instead of showing each column separately,
174+
;; what if we show how every column relates to every other column?
175+
;; This is the idea behind the scatterplot matrix.
176+
;;
177+
;; The matrix automatically chooses the right chart for each combination:
178+
179+
;; - Numeric × Numeric → scatter plot (reveal relationships)
180+
;; - Otherwise → bar chart (show distribution differences)
181+
182+
(defn matrix
  "Builds one SVG (hiccup tagged `:kind/hiccup`) holding a grid of plots, one
  cell for every ordered pair of columns in `ds`.

  Column `a` selects the horizontal position and the x mapping; column `b`
  selects the vertical position and the y mapping. A cell uses the `:point`
  geometry when both columns are numeric and `:bar` otherwise."
  [ds]
  (let [names (tc/column-names ds)
        n (count names)
        indexed (map-indexed vector names)
        numeric? (fn [col-name] (is-numeric-type? (ds col-name)))
        cell (fn [[a-idx a] [b-idx b]]
               (let [both-numeric? (and (numeric? a) (numeric? b))
                     offset (str "translate(" (* a-idx plot-width)
                                 "," (* b-idx plot-height) ")")]
                 [:g {:transform offset}
                  ;; frame around the cell
                  [:rect {:x 0 :y 0 :width plot-width :height plot-height
                          :fill "none" :stroke "gray" :stroke-width 1}]
                  (plot-basic [:graphic {:data ds
                                         :mappings {:x a :y b}
                                         :geometry (if both-numeric?
                                                     [:point]
                                                     [:bar])}])]))]
    ^:kind/hiccup
    [:svg {:width "100%"
           :viewBox (str/join " " [0 0 (* plot-width n) (* plot-height n)])
           :xmlns "http://www.w3.org/2000/svg"
           :style {:border "solid 1px gray"}}
     [:g {:stroke "gray", :fill "none"}
      (for [a-entry indexed
            b-entry indexed]
        (cell a-entry b-entry))]]))
205+
206+
(matrix penguins)

0 commit comments

Comments
 (0)