Commit 9d8b29f

Merge branch 'iris'
2 parents 244507e + 879d768

File tree: 5 files changed (+100, −1 lines)


MANIFEST

Lines changed: 1 addition & 0 deletions
```diff
@@ -4,4 +4,5 @@ setup.py
 extra_keras_datasets\__init__.py
 extra_keras_datasets\emnist.py
 extra_keras_datasets\kmnist.py
+extra_keras_datasets\stl10.py
 extra_keras_datasets\svhn.py
```

README.md

Lines changed: 15 additions & 0 deletions
````diff
@@ -23,6 +23,7 @@ Hi there, and welcome to the `extra-keras-datasets` module! This extension to th
 * [SVHN-Normal](#svhn-normal)
 * [SVHN-Extra](#svhn-extra)
 * [STL-10](#stl-10)
+* [Iris](#iris)
 - [Contributors and other references](#contributors-and-other-references)
 - [License](#license)
 
@@ -167,6 +168,20 @@ from extra-keras-datasets import stl10
 
 ---
 
+### Iris
+This is perhaps the best-known database in the pattern recognition literature. Fisher's paper is a classic in the field and is still referenced frequently today (see Duda & Hart, for example). The dataset contains 3 classes of 50 instances each, where each class refers to a type of iris plant. One class is linearly separable from the other 2; the latter are NOT linearly separable from each other.
+
+Predicted attribute: class of iris plant.
+
+```python
+from extra_keras_datasets import iris
+(input_train, target_train), (input_test, target_test) = iris.load_data(test_split=0.2)
+```
+
+<a href="./assets/iris.png"><img src="./assets/iris.png" width="100%" style="border: 3px solid #f6f8fa;" /></a>
+
+---
+
 ## Contributors and other references
 * **EMNIST dataset:**
 * Cohen, G., Afshar, S., Tapson, J., & van Schaik, A. (2017). EMNIST: an extension of MNIST to handwritten letters. Retrieved from http://arxiv.org/abs/1702.05373
````
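The targets returned by `iris.load_data` are integer class ids (0 = setosa, 1 = versicolor, 2 = virginica, per the module docstring). For training a Keras classifier with categorical crossentropy they would typically be one-hot encoded first; a minimal NumPy sketch of that encoding (`one_hot` is a hypothetical helper for illustration, not part of this package):

```python
import numpy as np

def one_hot(targets, num_classes=3):
    # Hypothetical helper: map integer class ids (0..num_classes-1)
    # to one-hot rows by indexing into an identity matrix.
    return np.eye(num_classes)[np.asarray(targets)]

encoded = one_hot([0, 2, 1])
print(encoded.tolist())
# [[1.0, 0.0, 0.0], [0.0, 0.0, 1.0], [0.0, 1.0, 0.0]]
```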

assets/iris.png

Binary file added (38.9 KB)

extra_keras_datasets/__init__.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -3,4 +3,5 @@
 from . import emnist
 from . import kmnist
 from . import svhn
-from . import stl10
+from . import stl10
+from . import iris
```

extra_keras_datasets/iris.py

Lines changed: 82 additions & 0 deletions
New file:

```python
'''
Import the Iris dataset
Source: http://archive.ics.uci.edu/ml/datasets/Iris
Description: The dataset contains 3 classes of 50 instances each, where each class refers to a type of iris plant.

~~~ Important note ~~~
Please cite the following paper when using or referencing the dataset:
Fisher, R.A. "The use of multiple measurements in taxonomic problems." Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions to Mathematical Statistics" (John Wiley, NY, 1950).
'''

import math

import numpy as np
from keras.utils.data_utils import get_file


def load_data(path='iris.npz', test_split=0.2):
    '''Loads the Iris dataset.

    # Arguments
        path: path where to cache the dataset locally
            (relative to ~/.keras/datasets).
        test_split: fraction of the data to use for testing (20% by default).

    # Returns
        Tuple of Numpy arrays: `(input_train, target_train), (input_test, target_test)`.
        Input structure: (sepal length, sepal width, petal length, petal width).
        Target structure: 0 = Iris setosa; 1 = Iris versicolor; 2 = Iris virginica.
    '''
    path = get_file(
        path,
        origin='http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data')

    # Read data from file
    with open(path, 'r') as f:
        lines = f.readlines()

    # Process each line into an input/target structure
    samples = []
    for line in lines:
        sample = line_to_list(line)
        if sample is not None:
            samples.append(sample)

    # Randomly shuffle the data
    np.random.shuffle(samples)

    # Compute the number of test samples from test_split
    num_test_samples = math.floor(len(samples) * test_split)

    # Split the data
    training_data = samples[num_test_samples:]
    testing_data = samples[:num_test_samples]

    # Split into inputs and targets
    input_train = np.array([i[0:4] for i in training_data])
    input_test = np.array([i[0:4] for i in testing_data])
    target_train = np.array([i[4] for i in training_data])
    target_test = np.array([i[4] for i in testing_data])

    # Return data
    return (input_train, target_train), (input_test, target_test)


def line_to_list(line):
    '''Convert a string-based line into a tuple with input and target data.'''
    elements = line.split(',')
    if len(elements) > 1:
        target = target_string_to_int(elements[4])
        full_sample = [float(i) for i in elements[0:4]]
        full_sample.append(target)
        return tuple(full_sample)
    return None


def target_string_to_int(target_value):
    '''Convert a string-based target value into an integer-based one.'''
    target_value = target_value.strip()
    if target_value == 'Iris-setosa':
        return 0
    elif target_value == 'Iris-versicolor':
        return 1
    return 2
```
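The parsing and splitting logic in `iris.py` can be exercised without downloading anything. A minimal standalone sketch, assuming each record looks like `5.1,3.5,1.4,0.2,Iris-setosa` (the helper names here mirror, but do not import, the module above):

```python
import math
import random

LABELS = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}

def parse_line(line):
    # One CSV record: four float features plus a class label.
    # Blank or malformed lines yield None so callers can skip them.
    elements = line.strip().split(',')
    if len(elements) < 5:
        return None
    return tuple(float(v) for v in elements[:4]) + (LABELS[elements[4]],)

def split_samples(samples, test_split=0.2):
    # Shuffle, then carve off the first floor(n * test_split)
    # samples as the test set, as load_data does.
    samples = list(samples)
    random.shuffle(samples)
    num_test = math.floor(len(samples) * test_split)
    return samples[num_test:], samples[:num_test]

lines = [
    '5.1,3.5,1.4,0.2,Iris-setosa\n',
    '7.0,3.2,4.7,1.4,Iris-versicolor\n',
    '6.3,3.3,6.0,2.5,Iris-virginica\n',
    '5.8,2.7,5.1,1.9,Iris-virginica\n',
    '\n',
]
samples = [s for s in (parse_line(l) for l in lines) if s is not None]
train, test = split_samples(samples, test_split=0.25)
print(len(train), len(test))  # 3 1
```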
