Skip to content

Commit 476b565

Browse files
docs: Use deterministic data in examples
Replaces all calls to `np.random.rand` in the documentation with small, hardcoded datasets. This makes all code examples fully reproducible, deterministic, and easier for new users to understand at a glance.
1 parent 8913586 commit 476b565

File tree

2 files changed

+19
-66
lines changed

2 files changed

+19
-66
lines changed

README.md

Lines changed: 4 additions & 15 deletions
Original file line number · Diff line number · Diff line change
@@ -30,21 +30,10 @@ Here's a quick example of how to create a ROC curve for a single model:
3030
import numpy as np
3131
import rtichoke as rk
3232

33-
# For reproducibility
34-
np.random.seed(42)
35-
36-
# Generate more realistic sample data for a model
37-
# Probabilities for the positive class are generally higher
38-
probs_positive_class = np.random.rand(50) * 0.5 + 0.5 # High probabilities (0.5 to 1.0)
39-
probs_negative_class = np.random.rand(50) * 0.5 # Low probabilities (0.0 to 0.5)
40-
41-
# Combine and shuffle the data
42-
probs_combined = np.concatenate([probs_positive_class, probs_negative_class])
43-
reals_combined = np.concatenate([np.ones(50), np.zeros(50)])
44-
45-
shuffle_index = np.random.permutation(100)
46-
probs = {'Model A': probs_combined[shuffle_index]}
47-
reals = {'Population': reals_combined[shuffle_index]}
33+
# Sample data for a model. Note that the probabilities for the
34+
# positive class (1) are generally higher than for the negative class (0).
35+
probs = {'Model A': np.array([0.1, 0.9, 0.4, 0.8, 0.3, 0.7, 0.2, 0.6])}
36+
reals = {'Population': np.array([0, 1, 0, 1, 0, 1, 0, 1])}
4837

4938

5039
# Create the ROC curve

docs/tutorials/getting_started.qmd

Lines changed: 15 additions & 51 deletions
Original file line number · Diff line number · Diff line change
@@ -11,9 +11,6 @@ First, let's import the necessary libraries. We'll need `numpy` for data manipul
1111
```python
1212
import numpy as np
1313
import rtichoke as rk
14-
15-
# For reproducibility
16-
np.random.seed(42)
1714
```
1815

1916
## 2. Understanding the Inputs
@@ -32,15 +29,10 @@ This is the simplest case, where you want to evaluate the performance of a singl
3229
For this, you provide `probs` with a single entry for your model and `reals` with a single entry for the corresponding outcomes.
3330

3431
```python
35-
# Generate realistic sample data for a model
36-
probs_positive_class = np.random.rand(50) * 0.5 + 0.5
37-
probs_negative_class = np.random.rand(50) * 0.5
38-
probs_combined = np.concatenate([probs_positive_class, probs_negative_class])
39-
reals_combined = np.concatenate([np.ones(50), np.zeros(50)])
40-
shuffle_index = np.random.permutation(100)
41-
42-
probs_single = {"Model A": probs_combined[shuffle_index]}
43-
reals_single = {"Population": reals_combined[shuffle_index]}
32+
# Sample data for a model. Note that the probabilities for the
33+
# positive class (1) are generally higher than for the negative class (0).
34+
probs_single = {"Model A": np.array([0.1, 0.9, 0.4, 0.8, 0.3, 0.7, 0.2, 0.6])}
35+
reals_single = {"Population": np.array([0, 1, 0, 1, 0, 1, 0, 1])}
4436

4537
# Create a ROC curve
4638
fig = rk.create_roc_curve(
@@ -60,26 +52,13 @@ Often, you want to compare the performance of several different models on the *s
6052
For this, you provide `probs` with an entry for each model you want to compare. `reals` will still have a single entry, since the outcome data is the same for all models.
6153

6254
```python
63-
# Generate data for two different models to compare.
64-
# Model A has a clearer separation of probabilities.
65-
model_a_probs_pos = np.random.rand(50) * 0.4 + 0.6 # 0.6 to 1.0
66-
model_a_probs_neg = np.random.rand(50) * 0.4 # 0.0 to 0.4
67-
model_a_probs = np.concatenate([model_a_probs_pos, model_a_probs_neg])
68-
69-
# Model B has more overlap.
70-
model_b_probs_pos = np.random.rand(50) * 0.5 + 0.4 # 0.4 to 0.9
71-
model_b_probs_neg = np.random.rand(50) * 0.5 + 0.1 # 0.1 to 0.6
72-
model_b_probs = np.concatenate([model_b_probs_pos, model_b_probs_neg])
73-
74-
reals_comparison_data = np.concatenate([np.ones(50), np.zeros(50)])
75-
shuffle_index_comp = np.random.permutation(100)
76-
55+
# Sample data for two models. Model A is better at separating the classes.
7756
probs_comparison = {
78-
"Model A": model_a_probs[shuffle_index_comp],
79-
"Model B": model_b_probs[shuffle_index_comp],
80-
"Random Guess": np.random.rand(100)
57+
"Model A": np.array([0.1, 0.9, 0.2, 0.8, 0.3, 0.7]),
58+
"Model B": np.array([0.2, 0.8, 0.3, 0.7, 0.4, 0.6]),
59+
"Random Guess": np.array([0.5, 0.5, 0.5, 0.5, 0.5, 0.5])
8160
}
82-
reals_comparison = {"Population": reals_comparison_data[shuffle_index_comp]}
61+
reals_comparison = {"Population": np.array([0, 1, 0, 1, 0, 1])}
8362

8463

8564
# Create a precision-recall curve to compare the models
@@ -98,30 +77,15 @@ This is useful when you want to evaluate a single model's performance across dif
9877
For this, you provide `probs` with an entry for each population and `reals` with a corresponding entry for each population's outcomes.
9978

10079
```python
101-
# Generate sample data for a train and test set.
102-
# Let's assume the model is slightly overfit, performing better on the train set.
103-
104-
# Train set: clear separation
105-
train_probs_pos = np.random.rand(50) * 0.4 + 0.6
106-
train_probs_neg = np.random.rand(50) * 0.4
107-
train_probs = np.concatenate([train_probs_pos, train_probs_neg])
108-
train_reals = np.concatenate([np.ones(50), np.zeros(50)])
109-
train_shuffle = np.random.permutation(100)
110-
111-
# Test set: more overlap
112-
test_probs_pos = np.random.rand(40) * 0.5 + 0.4
113-
test_probs_neg = np.random.rand(40) * 0.5 + 0.1
114-
test_probs = np.concatenate([test_probs_pos, test_probs_neg])
115-
test_reals = np.concatenate([np.ones(40), np.zeros(40)])
116-
test_shuffle = np.random.permutation(80)
117-
80+
# Sample data for a train and test set.
81+
# The model performs slightly better on the train set.
11882
probs_populations = {
119-
"Train": train_probs[train_shuffle],
120-
"Test": test_probs[test_shuffle]
83+
"Train": np.array([0.1, 0.9, 0.2, 0.8, 0.3, 0.7]),
84+
"Test": np.array([0.2, 0.8, 0.3, 0.7, 0.4, 0.6])
12185
}
12286
reals_populations = {
123-
"Train": train_reals[train_shuffle],
124-
"Test": test_reals[test_shuffle]
87+
"Train": np.array([0, 1, 0, 1, 0, 1]),
88+
"Test": np.array([0, 1, 0, 1, 0, 0]) # Note one outcome is different
12589
}
12690

12791
# Create a calibration curve to compare the model's performance

0 commit comments

Comments (0)