From a82765a3b13d3f2e48b9ab9ae8e8187362c921ed Mon Sep 17 00:00:00 2001
From: Fabian Pedregosa <fabian.pedregosa@inria.fr>
Date: Wed, 6 Jan 2010 09:26:48 +0000
Subject: [PATCH] added a data set with missing entries.

From: dhuard <dhuard@cb17146a-f446-4be1-a4f7-bd7c5bb65646>

git-svn-id: https://scikit-learn.svn.sourceforge.net/svnroot/scikit-learn/trunk@247 22fbfee3-77ab-4535-9bad-27d1bd3bc7d8
---
 scikits/learn/datasets/nwis/COPYING          |  34 ++
 scikits/learn/datasets/nwis/__init__.py      |  10 +
 scikits/learn/datasets/nwis/data.py          |  98 +++++
 scikits/learn/datasets/nwis/nwis.py          | 137 +++++++
 scikits/learn/datasets/nwis/src/01423000.dly | 394 +++++++++++++++++++
 scikits/learn/datasets/nwis/src/convert.py   |  26 ++
 6 files changed, 699 insertions(+)
 create mode 100644 scikits/learn/datasets/nwis/COPYING
 create mode 100644 scikits/learn/datasets/nwis/__init__.py
 create mode 100644 scikits/learn/datasets/nwis/data.py
 create mode 100644 scikits/learn/datasets/nwis/nwis.py
 create mode 100644 scikits/learn/datasets/nwis/src/01423000.dly
 create mode 100644 scikits/learn/datasets/nwis/src/convert.py

diff --git a/scikits/learn/datasets/nwis/COPYING b/scikits/learn/datasets/nwis/COPYING
new file mode 100644
index 0000000000..b79acb272f
--- /dev/null
+++ b/scikits/learn/datasets/nwis/COPYING
@@ -0,0 +1,34 @@
+# The code and descriptive text is copyrighted and offered under the terms of
+# the BSD License from the authors; see below. However, the actual dataset may
+# have a different origin and intellectual property status. See the SOURCE and
+# COPYRIGHT variables for this information.
+
+# Copyright (c) 2007 David Huard <david.huard@gmail.com>
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+#     * Redistributions of source code must retain the above copyright
+#       notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in
+#       the documentation and/or other materials provided with the
+#       distribution.
+#     * Neither the author nor the names of any contributors may be used
+#       to endorse or promote products derived from this software without
+#       specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/scikits/learn/datasets/nwis/__init__.py b/scikits/learn/datasets/nwis/__init__.py
new file mode 100644
index 0000000000..c305edefd3
--- /dev/null
+++ b/scikits/learn/datasets/nwis/__init__.py
@@ -0,0 +1,10 @@
+#! /usr/bin/env python
+# Last Change: Mon Jul 02 02:00 PM 2007 J
+import data as _nwis
+__doc__     = _nwis.DESCRSHORT
+copyright   = _nwis.COPYRIGHT
+source      = _nwis.SOURCE
+# Re-export the loader so the dataset is available as nwis.load().
+load        = _nwis.load
+# Names exported by "from nwis import *"; only honoured when spelled __all__.
+__all__ = ['load', 'copyright', 'source']
diff --git a/scikits/learn/datasets/nwis/data.py b/scikits/learn/datasets/nwis/data.py
new file mode 100644
index 0000000000..a2b0819c5d
--- /dev/null
+++ b/scikits/learn/datasets/nwis/data.py
@@ -0,0 +1,98 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+# Last Change: Wed Sep 19 2007
+
+# The code and descriptive text is copyrighted and offered under the terms of
+# the BSD License from the authors; see below. However, the actual dataset may
+# have a different origin and intellectual property status. See the SOURCE and
+# COPYRIGHT variables for this information.
+
+# Copyright (c) 2007 David Huard <david.huard@gmail.com>
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+#     * Redistributions of source code must retain the above copyright
+#       notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in
+#       the documentation and/or other materials provided with the
+#       distribution.
+#     * Neither the author nor the names of any contributors may be used
+#       to endorse or promote products derived from this software without
+#       specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+"""USGS daily discharge at West Branch Delaware River at Walton NY."""
+
+__docformat__ = 'restructuredtext'
+# Dataset metadata: plain strings describing origin and content of the data.
+COPYRIGHT   = """See src/01423000.dly """
+TITLE       = "Daily discharge at West Branch Delaware River at Walton, NY"
+SOURCE      = """Creator: National Water Information System, U.S. Geological Survey"""
+
+DESCRSHORT  = """USGS daily discharge at West Branch Delaware River at Walton, NY."""
+
+DESCRLONG   = """Daily discharge (cubic feet per second) from 2006-9-18 to 2007-9-18 acquired by the 
+U.S. Geological Survey and released through the National Water Information System. The data code is included, describing the data quality: 'A' for approved and 'P' for provisional data. Missing discharges, or discharges affected by ice are indicated by the value -999.0. 
+"""
+
+NOTE        = """Discharge is computed by measuring the water level and estimating the related discharge through a rating curve. The rating curve is drawn using simultaneous measurements of water level and discharge. The rating curve typically changes over time due to erosion, presence of debris or ice. In this dataset, some discharges are missing due to the presence of ice."""
+
+def load():
+    """Return the nwis discharge data.
+
+    :Returns:
+        d : dict
+            contains the following values:
+            - 'data' : a record array with the fields 'year', 'month',
+              'day' (ints), 'discharge' (float, -999.0 where the value is
+              missing or ice-affected) and 'code' (one-character data
+              quality code, empty when missing)
+
+    Example
+    -------
+    Mask the missing data in discharge and plot the time series.
+
+    >>> import datetime, pylab, numpy
+    >>> data = load()['data']
+    >>> discharge = numpy.ma.masked_values(data['discharge'], -999.0)
+    >>> datelist = zip(data['year'], data['month'], data['day'])
+    >>> date = [datetime.date(y,m,d) for y,m,d in datelist]
+    >>> pylab.plot_date(date, discharge, '-')
+    """
+    import numpy
+    from nwis import date, discharge, code
+    import re
+
+    # Map the missing-data markers to -999.0 before building the array, so
+    # the sentinel never has to fit into a fixed-width string array.
+    discharge = numpy.array([-999.0 if q in ('Ice', '') else float(q)
+                             for q in discharge])
+
+    # Convert date string to ints
+    pattern = re.compile(r'(\d{4})-(\d{2})-(\d{2})')
+    ymd = numpy.array([pattern.search(d).groups() for d in date]).astype(int)
+
+    # Record layout; 'code' is stored as a single character.
+    fields = [('year', int), ('month', int), ('day', int),
+              ('discharge', float), ('code', (str, 1))]
+    records = numpy.empty(len(discharge), fields)
+    records['year'], records['month'], records['day'] = ymd.transpose()
+    records['discharge'] = discharge
+    records['code'] = code  # 'A' approved, 'P' provisional
+    return {'data': records}
diff --git a/scikits/learn/datasets/nwis/nwis.py b/scikits/learn/datasets/nwis/nwis.py
new file mode 100644
index 0000000000..103281b580
--- /dev/null
+++ b/scikits/learn/datasets/nwis/nwis.py
@@ -0,0 +1,137 @@
+# Autogenerated by convert.py at Wed, 19 Sep 2007 15:53:31 +0000
+
+date = ['2006-09-18', '2006-09-19', '2006-09-20', '2006-09-21', '2006-09-22',
+'2006-09-23', '2006-09-24', '2006-09-25', '2006-09-26', '2006-09-27',
+'2006-09-28', '2006-09-29', '2006-09-30', '2006-10-01', '2006-10-02',
+'2006-10-03', '2006-10-04', '2006-10-05', '2006-10-06', '2006-10-07',
+'2006-10-08', '2006-10-09', '2006-10-10', '2006-10-11', '2006-10-12',
+'2006-10-13', '2006-10-14', '2006-10-15', '2006-10-16', '2006-10-17',
+'2006-10-18', '2006-10-19', '2006-10-20', '2006-10-21', '2006-10-22',
+'2006-10-23', '2006-10-24', '2006-10-25', '2006-10-26', '2006-10-27',
+'2006-10-28', '2006-10-29', '2006-10-30', '2006-10-31', '2006-11-01',
+'2006-11-02', '2006-11-03', '2006-11-04', '2006-11-05', '2006-11-06',
+'2006-11-07', '2006-11-08', '2006-11-09', '2006-11-10', '2006-11-11',
+'2006-11-12', '2006-11-13', '2006-11-14', '2006-11-15', '2006-11-16',
+'2006-11-17', '2006-11-18', '2006-11-19', '2006-11-20', '2006-11-21',
+'2006-11-22', '2006-11-23', '2006-11-24', '2006-11-25', '2006-11-26',
+'2006-11-27', '2006-11-28', '2006-11-29', '2006-11-30', '2006-12-01',
+'2006-12-02', '2006-12-03', '2006-12-04', '2006-12-05', '2006-12-06',
+'2006-12-07', '2006-12-08', '2006-12-09', '2006-12-10', '2006-12-11',
+'2006-12-12', '2006-12-13', '2006-12-14', '2006-12-15', '2006-12-16',
+'2006-12-17', '2006-12-18', '2006-12-19', '2006-12-20', '2006-12-21',
+'2006-12-22', '2006-12-23', '2006-12-24', '2006-12-25', '2006-12-26',
+'2006-12-27', '2006-12-28', '2006-12-29', '2006-12-30', '2006-12-31',
+'2007-01-01', '2007-01-02', '2007-01-03', '2007-01-04', '2007-01-05',
+'2007-01-06', '2007-01-07', '2007-01-08', '2007-01-09', '2007-01-10',
+'2007-01-11', '2007-01-12', '2007-01-13', '2007-01-14', '2007-01-15',
+'2007-01-16', '2007-01-17', '2007-01-18', '2007-01-19', '2007-01-20',
+'2007-01-21', '2007-01-22', '2007-01-23', '2007-01-24', '2007-01-25',
+'2007-01-26', '2007-01-27', '2007-01-28', '2007-01-29', '2007-01-30',
+'2007-01-31', '2007-02-01', '2007-02-02', '2007-02-03', '2007-02-04',
+'2007-02-05', '2007-02-06', '2007-02-07', '2007-02-08', '2007-02-09',
+'2007-02-10', '2007-02-11', '2007-02-12', '2007-02-13', '2007-02-14',
+'2007-02-15', '2007-02-16', '2007-02-17', '2007-02-18', '2007-02-19',
+'2007-02-20', '2007-02-21', '2007-02-22', '2007-02-23', '2007-02-24',
+'2007-02-25', '2007-02-26', '2007-02-27', '2007-02-28', '2007-03-01',
+'2007-03-02', '2007-03-03', '2007-03-04', '2007-03-05', '2007-03-06',
+'2007-03-07', '2007-03-08', '2007-03-09', '2007-03-10', '2007-03-11',
+'2007-03-12', '2007-03-13', '2007-03-14', '2007-03-15', '2007-03-16',
+'2007-03-17', '2007-03-18', '2007-03-19', '2007-03-20', '2007-03-21',
+'2007-03-22', '2007-03-23', '2007-03-24', '2007-03-25', '2007-03-26',
+'2007-03-27', '2007-03-28', '2007-03-29', '2007-03-30', '2007-03-31',
+'2007-04-01', '2007-04-02', '2007-04-03', '2007-04-04', '2007-04-05',
+'2007-04-06', '2007-04-07', '2007-04-08', '2007-04-09', '2007-04-10',
+'2007-04-11', '2007-04-12', '2007-04-13', '2007-04-14', '2007-04-15',
+'2007-04-16', '2007-04-17', '2007-04-18', '2007-04-19', '2007-04-20',
+'2007-04-21', '2007-04-22', '2007-04-23', '2007-04-24', '2007-04-25',
+'2007-04-26', '2007-04-27', '2007-04-28', '2007-04-29', '2007-04-30',
+'2007-05-01', '2007-05-02', '2007-05-03', '2007-05-04', '2007-05-05',
+'2007-05-06', '2007-05-07', '2007-05-08', '2007-05-09', '2007-05-10',
+'2007-05-11', '2007-05-12', '2007-05-13', '2007-05-14', '2007-05-15',
+'2007-05-16', '2007-05-17', '2007-05-18', '2007-05-19', '2007-05-20',
+'2007-05-21', '2007-05-22', '2007-05-23', '2007-05-24', '2007-05-25',
+'2007-05-26', '2007-05-27', '2007-05-28', '2007-05-29', '2007-05-30',
+'2007-05-31', '2007-06-01', '2007-06-02', '2007-06-03', '2007-06-04',
+'2007-06-05', '2007-06-06', '2007-06-07', '2007-06-08', '2007-06-09',
+'2007-06-10', '2007-06-11', '2007-06-12', '2007-06-13', '2007-06-14',
+'2007-06-15', '2007-06-16', '2007-06-17', '2007-06-18', '2007-06-19',
+'2007-06-20', '2007-06-21', '2007-06-22', '2007-06-23', '2007-06-24',
+'2007-06-25', '2007-06-26', '2007-06-27', '2007-06-28', '2007-06-29',
+'2007-06-30', '2007-07-01', '2007-07-02', '2007-07-03', '2007-07-04',
+'2007-07-05', '2007-07-06', '2007-07-07', '2007-07-08', '2007-07-09',
+'2007-07-10', '2007-07-11', '2007-07-12', '2007-07-13', '2007-07-14',
+'2007-07-15', '2007-07-16', '2007-07-17', '2007-07-18', '2007-07-19',
+'2007-07-20', '2007-07-21', '2007-07-22', '2007-07-23', '2007-07-24',
+'2007-07-25', '2007-07-26', '2007-07-27', '2007-07-28', '2007-07-29',
+'2007-07-30', '2007-07-31', '2007-08-01', '2007-08-02', '2007-08-03',
+'2007-08-04', '2007-08-05', '2007-08-06', '2007-08-07', '2007-08-08',
+'2007-08-09', '2007-08-10', '2007-08-11', '2007-08-12', '2007-08-13',
+'2007-08-14', '2007-08-15', '2007-08-16', '2007-08-17', '2007-08-18',
+'2007-08-19', '2007-08-20', '2007-08-21', '2007-08-22', '2007-08-23',
+'2007-08-24', '2007-08-25', '2007-08-26', '2007-08-27', '2007-08-28',
+'2007-08-29', '2007-08-30', '2007-08-31', '2007-09-01', '2007-09-02',
+'2007-09-03', '2007-09-04', '2007-09-05', '2007-09-06', '2007-09-07',
+'2007-09-08', '2007-09-09', '2007-09-10', '2007-09-11', '2007-09-12',
+'2007-09-13', '2007-09-14', '2007-09-15', '2007-09-16', '2007-09-17',
+'2007-09-18']
+
+discharge = ['468', '430', '412', '375', '331', '358', '372', '328', '291',
+'261', '258', '1230', '933', '1040', '1130', '920', '840', '822', '690', '602',
+'537', '486', '441', '405', '402', '397', '351', '319', '294', '303', '399',
+'354', '1190', '1770', '1260', '1090', '945', '898', '839', '713', '2790',
+'3960', '2460', '1880', '1510', '1480', '1340', '1030', '885', '789', '712',
+'838', '1560', '1090', '987', '930', '970', '863', '806', '1510', '3990',
+'2260', '1800', '1500', '1240', '1060', '1260', '1260', '995', '917', '848',
+'786', '721', '670', '757', '1260', '869', '808', '758', '692', '677', '617',
+'553', '533', '507', '489', '496', '503', '450', '436', '407', '387', '366',
+'346', '337', '326', '500', '503', '425', '647', '745', '634', '624', '601',
+'575', '672', '740', '630', '587', '599', '1560', '1340', '2020', '1840',
+'1520', '1280', '1140', '1270', '1320', '2100', '2780', '1970', '1600', '1460',
+'1240', '973', '913', '817', '729', '642', '464', 'Ice', 'Ice', 'Ice', 'Ice',
+'Ice', 'Ice', 'Ice', 'Ice', 'Ice', 'Ice', 'Ice', 'Ice', 'Ice', 'Ice', 'Ice',
+'Ice', 'Ice', 'Ice', 'Ice', 'Ice', 'Ice', 'Ice', 'Ice', 'Ice', 'Ice', 'Ice',
+'Ice', 'Ice', 'Ice', 'Ice', 'Ice', 'Ice', 'Ice', 'Ice', 'Ice', 'Ice', 'Ice',
+'Ice', 'Ice', 'Ice', 'Ice', 'Ice', 'Ice', 'Ice', 'Ice', 'Ice', '1680', '6580',
+'4550', '2730', '1980', '1500', '1280', '1030', '1330', '3530', '2690', '3910',
+'3250', '4330', '4090', '3080', '2310', '1850', '1550', '1480', '1270', '1300',
+'1440', '1180', '1020', '914', '808', '727', '656', '812', '1070', '839',
+'1370', '3980', '4180', '3880', '3810', '3620', '2830', '2290', '1920', '1730',
+'1500', '1620', '1220', '1140', '1050', '909', '780', '822', '704', '605',
+'547', '497', '453', '417', '387', '423', '800', '602', '463', '411', '380',
+'396', '458', '370', '344', '442', '407', '325', '287', '263', '240', '221',
+'211', '241', '215', '181', '166', '164', '203', '230', '323', '325', '255',
+'196', '170', '155', '139', '129', '123', '119', '112', '104', '100', '120',
+'104', '106', '377', '217', '205', '165', '140', '126', '116', '116', '147',
+'146', '116', '105', '97', '91', '97', '134', '156', '135', '124', '123',
+'177', '144', '150', '131', '110', '100', '97', '93', '96', '116', '316',
+'245', '159', '145', '207', '174', '139', '144', '240', '231', '164', '140',
+'125', '112', '114', '110', '95', '87', '85', '118', '121', '489', '488',
+'261', '201', '171', '150', '155', '256', '188', '163', '143', '158', '174',
+'148', '138', '141', '123', '112', '103', '95', '89', '84', '79', '74', '70',
+'67', '63', '60', '58', '68', '135', '124', '983', '1220', '542', '397', '',
+'625', '465', '392']
+
+code = ['A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'P',
+'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P',
+'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P',
+'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P',
+'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P',
+'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P',
+'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P',
+'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P',
+'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P',
+'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P',
+'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P',
+'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P',
+'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P',
+'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P',
+'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P',
+'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P',
+'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P',
+'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P',
+'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P',
+'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P',
+'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P',
+'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P',
+'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', '', 'P', 'P', 'P']
+
diff --git a/scikits/learn/datasets/nwis/src/01423000.dly b/scikits/learn/datasets/nwis/src/01423000.dly
new file mode 100644
index 0000000000..04e433ed5d
--- /dev/null
+++ b/scikits/learn/datasets/nwis/src/01423000.dly
@@ -0,0 +1,394 @@
+# ---------------------------------- WARNING ----------------------------------------
+# The data you have obtained from this automated U.S. Geological Survey database
+# have not received Director's approval and as such are provisional and subject to
+# revision.  The data are released on the condition that neither the USGS nor the
+# United States Government may be held liable for any damages resulting from its use.
+# Additional info: http://waterdata.usgs.gov/nwis/help/?provisional
+#
+# File-format description:  http://waterdata.usgs.gov/nwis/?tab_delimited_format_info
+# Automated-retrieval info: http://waterdata.usgs.gov/nwis/?automated_retrieval_info
+#
+# Contact:   gs-w_support_nwisweb@usgs.gov
+# retrieved: 2007-09-19 10:31:26 EDT
+#
+# Data for the following site(s) are contained in this file
+#    USGS 01423000 WEST BRANCH DELAWARE RIVER AT WALTON NY
+# -----------------------------------------------------------------------------------
+#
+# Data provided for site 01423000
+#    DD parameter statistic   Description
+#    01   00060     00003     Discharge, cubic feet per second (Mean)
+#
+# Data-value qualification codes included in this output: 
+#     Ice  Ice affected  
+#     A  Approved for publication -- Processing and review completed.  
+#     P  Provisional data subject to revision.  
+# 
+agency_cd	site_no	datetime	01_00060_00003	01_00060_00003_cd
+5s	15s	16s	14s	14s
+USGS	01423000	2006-09-18	468	A
+USGS	01423000	2006-09-19	430	A
+USGS	01423000	2006-09-20	412	A
+USGS	01423000	2006-09-21	375	A
+USGS	01423000	2006-09-22	331	A
+USGS	01423000	2006-09-23	358	A
+USGS	01423000	2006-09-24	372	A
+USGS	01423000	2006-09-25	328	A
+USGS	01423000	2006-09-26	291	A
+USGS	01423000	2006-09-27	261	A
+USGS	01423000	2006-09-28	258	A
+USGS	01423000	2006-09-29	1230	A
+USGS	01423000	2006-09-30	933	A
+USGS	01423000	2006-10-01	1040	P
+USGS	01423000	2006-10-02	1130	P
+USGS	01423000	2006-10-03	920	P
+USGS	01423000	2006-10-04	840	P
+USGS	01423000	2006-10-05	822	P
+USGS	01423000	2006-10-06	690	P
+USGS	01423000	2006-10-07	602	P
+USGS	01423000	2006-10-08	537	P
+USGS	01423000	2006-10-09	486	P
+USGS	01423000	2006-10-10	441	P
+USGS	01423000	2006-10-11	405	P
+USGS	01423000	2006-10-12	402	P
+USGS	01423000	2006-10-13	397	P
+USGS	01423000	2006-10-14	351	P
+USGS	01423000	2006-10-15	319	P
+USGS	01423000	2006-10-16	294	P
+USGS	01423000	2006-10-17	303	P
+USGS	01423000	2006-10-18	399	P
+USGS	01423000	2006-10-19	354	P
+USGS	01423000	2006-10-20	1190	P
+USGS	01423000	2006-10-21	1770	P
+USGS	01423000	2006-10-22	1260	P
+USGS	01423000	2006-10-23	1090	P
+USGS	01423000	2006-10-24	945	P
+USGS	01423000	2006-10-25	898	P
+USGS	01423000	2006-10-26	839	P
+USGS	01423000	2006-10-27	713	P
+USGS	01423000	2006-10-28	2790	P
+USGS	01423000	2006-10-29	3960	P
+USGS	01423000	2006-10-30	2460	P
+USGS	01423000	2006-10-31	1880	P
+USGS	01423000	2006-11-01	1510	P
+USGS	01423000	2006-11-02	1480	P
+USGS	01423000	2006-11-03	1340	P
+USGS	01423000	2006-11-04	1030	P
+USGS	01423000	2006-11-05	885	P
+USGS	01423000	2006-11-06	789	P
+USGS	01423000	2006-11-07	712	P
+USGS	01423000	2006-11-08	838	P
+USGS	01423000	2006-11-09	1560	P
+USGS	01423000	2006-11-10	1090	P
+USGS	01423000	2006-11-11	987	P
+USGS	01423000	2006-11-12	930	P
+USGS	01423000	2006-11-13	970	P
+USGS	01423000	2006-11-14	863	P
+USGS	01423000	2006-11-15	806	P
+USGS	01423000	2006-11-16	1510	P
+USGS	01423000	2006-11-17	3990	P
+USGS	01423000	2006-11-18	2260	P
+USGS	01423000	2006-11-19	1800	P
+USGS	01423000	2006-11-20	1500	P
+USGS	01423000	2006-11-21	1240	P
+USGS	01423000	2006-11-22	1060	P
+USGS	01423000	2006-11-23	1260	P
+USGS	01423000	2006-11-24	1260	P
+USGS	01423000	2006-11-25	995	P
+USGS	01423000	2006-11-26	917	P
+USGS	01423000	2006-11-27	848	P
+USGS	01423000	2006-11-28	786	P
+USGS	01423000	2006-11-29	721	P
+USGS	01423000	2006-11-30	670	P
+USGS	01423000	2006-12-01	757	P
+USGS	01423000	2006-12-02	1260	P
+USGS	01423000	2006-12-03	869	P
+USGS	01423000	2006-12-04	808	P
+USGS	01423000	2006-12-05	758	P
+USGS	01423000	2006-12-06	692	P
+USGS	01423000	2006-12-07	677	P
+USGS	01423000	2006-12-08	617	P
+USGS	01423000	2006-12-09	553	P
+USGS	01423000	2006-12-10	533	P
+USGS	01423000	2006-12-11	507	P
+USGS	01423000	2006-12-12	489	P
+USGS	01423000	2006-12-13	496	P
+USGS	01423000	2006-12-14	503	P
+USGS	01423000	2006-12-15	450	P
+USGS	01423000	2006-12-16	436	P
+USGS	01423000	2006-12-17	407	P
+USGS	01423000	2006-12-18	387	P
+USGS	01423000	2006-12-19	366	P
+USGS	01423000	2006-12-20	346	P
+USGS	01423000	2006-12-21	337	P
+USGS	01423000	2006-12-22	326	P
+USGS	01423000	2006-12-23	500	P
+USGS	01423000	2006-12-24	503	P
+USGS	01423000	2006-12-25	425	P
+USGS	01423000	2006-12-26	647	P
+USGS	01423000	2006-12-27	745	P
+USGS	01423000	2006-12-28	634	P
+USGS	01423000	2006-12-29	624	P
+USGS	01423000	2006-12-30	601	P
+USGS	01423000	2006-12-31	575	P
+USGS	01423000	2007-01-01	672	P
+USGS	01423000	2007-01-02	740	P
+USGS	01423000	2007-01-03	630	P
+USGS	01423000	2007-01-04	587	P
+USGS	01423000	2007-01-05	599	P
+USGS	01423000	2007-01-06	1560	P
+USGS	01423000	2007-01-07	1340	P
+USGS	01423000	2007-01-08	2020	P
+USGS	01423000	2007-01-09	1840	P
+USGS	01423000	2007-01-10	1520	P
+USGS	01423000	2007-01-11	1280	P
+USGS	01423000	2007-01-12	1140	P
+USGS	01423000	2007-01-13	1270	P
+USGS	01423000	2007-01-14	1320	P
+USGS	01423000	2007-01-15	2100	P
+USGS	01423000	2007-01-16	2780	P
+USGS	01423000	2007-01-17	1970	P
+USGS	01423000	2007-01-18	1600	P
+USGS	01423000	2007-01-19	1460	P
+USGS	01423000	2007-01-20	1240	P
+USGS	01423000	2007-01-21	973	P
+USGS	01423000	2007-01-22	913	P
+USGS	01423000	2007-01-23	817	P
+USGS	01423000	2007-01-24	729	P
+USGS	01423000	2007-01-25	642	P
+USGS	01423000	2007-01-26	464	P
+USGS	01423000	2007-01-27	Ice	P
+USGS	01423000	2007-01-28	Ice	P
+USGS	01423000	2007-01-29	Ice	P
+USGS	01423000	2007-01-30	Ice	P
+USGS	01423000	2007-01-31	Ice	P
+USGS	01423000	2007-02-01	Ice	P
+USGS	01423000	2007-02-02	Ice	P
+USGS	01423000	2007-02-03	Ice	P
+USGS	01423000	2007-02-04	Ice	P
+USGS	01423000	2007-02-05	Ice	P
+USGS	01423000	2007-02-06	Ice	P
+USGS	01423000	2007-02-07	Ice	P
+USGS	01423000	2007-02-08	Ice	P
+USGS	01423000	2007-02-09	Ice	P
+USGS	01423000	2007-02-10	Ice	P
+USGS	01423000	2007-02-11	Ice	P
+USGS	01423000	2007-02-12	Ice	P
+USGS	01423000	2007-02-13	Ice	P
+USGS	01423000	2007-02-14	Ice	P
+USGS	01423000	2007-02-15	Ice	P
+USGS	01423000	2007-02-16	Ice	P
+USGS	01423000	2007-02-17	Ice	P
+USGS	01423000	2007-02-18	Ice	P
+USGS	01423000	2007-02-19	Ice	P
+USGS	01423000	2007-02-20	Ice	P
+USGS	01423000	2007-02-21	Ice	P
+USGS	01423000	2007-02-22	Ice	P
+USGS	01423000	2007-02-23	Ice	P
+USGS	01423000	2007-02-24	Ice	P
+USGS	01423000	2007-02-25	Ice	P
+USGS	01423000	2007-02-26	Ice	P
+USGS	01423000	2007-02-27	Ice	P
+USGS	01423000	2007-02-28	Ice	P
+USGS	01423000	2007-03-01	Ice	P
+USGS	01423000	2007-03-02	Ice	P
+USGS	01423000	2007-03-03	Ice	P
+USGS	01423000	2007-03-04	Ice	P
+USGS	01423000	2007-03-05	Ice	P
+USGS	01423000	2007-03-06	Ice	P
+USGS	01423000	2007-03-07	Ice	P
+USGS	01423000	2007-03-08	Ice	P
+USGS	01423000	2007-03-09	Ice	P
+USGS	01423000	2007-03-10	Ice	P
+USGS	01423000	2007-03-11	Ice	P
+USGS	01423000	2007-03-12	Ice	P
+USGS	01423000	2007-03-13	Ice	P
+USGS	01423000	2007-03-14	1680	P
+USGS	01423000	2007-03-15	6580	P
+USGS	01423000	2007-03-16	4550	P
+USGS	01423000	2007-03-17	2730	P
+USGS	01423000	2007-03-18	1980	P
+USGS	01423000	2007-03-19	1500	P
+USGS	01423000	2007-03-20	1280	P
+USGS	01423000	2007-03-21	1030	P
+USGS	01423000	2007-03-22	1330	P
+USGS	01423000	2007-03-23	3530	P
+USGS	01423000	2007-03-24	2690	P
+USGS	01423000	2007-03-25	3910	P
+USGS	01423000	2007-03-26	3250	P
+USGS	01423000	2007-03-27	4330	P
+USGS	01423000	2007-03-28	4090	P
+USGS	01423000	2007-03-29	3080	P
+USGS	01423000	2007-03-30	2310	P
+USGS	01423000	2007-03-31	1850	P
+USGS	01423000	2007-04-01	1550	P
+USGS	01423000	2007-04-02	1480	P
+USGS	01423000	2007-04-03	1270	P
+USGS	01423000	2007-04-04	1300	P
+USGS	01423000	2007-04-05	1440	P
+USGS	01423000	2007-04-06	1180	P
+USGS	01423000	2007-04-07	1020	P
+USGS	01423000	2007-04-08	914	P
+USGS	01423000	2007-04-09	808	P
+USGS	01423000	2007-04-10	727	P
+USGS	01423000	2007-04-11	656	P
+USGS	01423000	2007-04-12	812	P
+USGS	01423000	2007-04-13	1070	P
+USGS	01423000	2007-04-14	839	P
+USGS	01423000	2007-04-15	1370	P
+USGS	01423000	2007-04-16	3980	P
+USGS	01423000	2007-04-17	4180	P
+USGS	01423000	2007-04-18	3880	P
+USGS	01423000	2007-04-19	3810	P
+USGS	01423000	2007-04-20	3620	P
+USGS	01423000	2007-04-21	2830	P
+USGS	01423000	2007-04-22	2290	P
+USGS	01423000	2007-04-23	1920	P
+USGS	01423000	2007-04-24	1730	P
+USGS	01423000	2007-04-25	1500	P
+USGS	01423000	2007-04-26	1620	P
+USGS	01423000	2007-04-27	1220	P
+USGS	01423000	2007-04-28	1140	P
+USGS	01423000	2007-04-29	1050	P
+USGS	01423000	2007-04-30	909	P
+USGS	01423000	2007-05-01	780	P
+USGS	01423000	2007-05-02	822	P
+USGS	01423000	2007-05-03	704	P
+USGS	01423000	2007-05-04	605	P
+USGS	01423000	2007-05-05	547	P
+USGS	01423000	2007-05-06	497	P
+USGS	01423000	2007-05-07	453	P
+USGS	01423000	2007-05-08	417	P
+USGS	01423000	2007-05-09	387	P
+USGS	01423000	2007-05-10	423	P
+USGS	01423000	2007-05-11	800	P
+USGS	01423000	2007-05-12	602	P
+USGS	01423000	2007-05-13	463	P
+USGS	01423000	2007-05-14	411	P
+USGS	01423000	2007-05-15	380	P
+USGS	01423000	2007-05-16	396	P
+USGS	01423000	2007-05-17	458	P
+USGS	01423000	2007-05-18	370	P
+USGS	01423000	2007-05-19	344	P
+USGS	01423000	2007-05-20	442	P
+USGS	01423000	2007-05-21	407	P
+USGS	01423000	2007-05-22	325	P
+USGS	01423000	2007-05-23	287	P
+USGS	01423000	2007-05-24	263	P
+USGS	01423000	2007-05-25	240	P
+USGS	01423000	2007-05-26	221	P
+USGS	01423000	2007-05-27	211	P
+USGS	01423000	2007-05-28	241	P
+USGS	01423000	2007-05-29	215	P
+USGS	01423000	2007-05-30	181	P
+USGS	01423000	2007-05-31	166	P
+USGS	01423000	2007-06-01	164	P
+USGS	01423000	2007-06-02	203	P
+USGS	01423000	2007-06-03	230	P
+USGS	01423000	2007-06-04	323	P
+USGS	01423000	2007-06-05	325	P
+USGS	01423000	2007-06-06	255	P
+USGS	01423000	2007-06-07	196	P
+USGS	01423000	2007-06-08	170	P
+USGS	01423000	2007-06-09	155	P
+USGS	01423000	2007-06-10	139	P
+USGS	01423000	2007-06-11	129	P
+USGS	01423000	2007-06-12	123	P
+USGS	01423000	2007-06-13	119	P
+USGS	01423000	2007-06-14	112	P
+USGS	01423000	2007-06-15	104	P
+USGS	01423000	2007-06-16	100	P
+USGS	01423000	2007-06-17	120	P
+USGS	01423000	2007-06-18	104	P
+USGS	01423000	2007-06-19	106	P
+USGS	01423000	2007-06-20	377	P
+USGS	01423000	2007-06-21	217	P
+USGS	01423000	2007-06-22	205	P
+USGS	01423000	2007-06-23	165	P
+USGS	01423000	2007-06-24	140	P
+USGS	01423000	2007-06-25	126	P
+USGS	01423000	2007-06-26	116	P
+USGS	01423000	2007-06-27	116	P
+USGS	01423000	2007-06-28	147	P
+USGS	01423000	2007-06-29	146	P
+USGS	01423000	2007-06-30	116	P
+USGS	01423000	2007-07-01	105	P
+USGS	01423000	2007-07-02	97	P
+USGS	01423000	2007-07-03	91	P
+USGS	01423000	2007-07-04	97	P
+USGS	01423000	2007-07-05	134	P
+USGS	01423000	2007-07-06	156	P
+USGS	01423000	2007-07-07	135	P
+USGS	01423000	2007-07-08	124	P
+USGS	01423000	2007-07-09	123	P
+USGS	01423000	2007-07-10	177	P
+USGS	01423000	2007-07-11	144	P
+USGS	01423000	2007-07-12	150	P
+USGS	01423000	2007-07-13	131	P
+USGS	01423000	2007-07-14	110	P
+USGS	01423000	2007-07-15	100	P
+USGS	01423000	2007-07-16	97	P
+USGS	01423000	2007-07-17	93	P
+USGS	01423000	2007-07-18	96	P
+USGS	01423000	2007-07-19	116	P
+USGS	01423000	2007-07-20	316	P
+USGS	01423000	2007-07-21	245	P
+USGS	01423000	2007-07-22	159	P
+USGS	01423000	2007-07-23	145	P
+USGS	01423000	2007-07-24	207	P
+USGS	01423000	2007-07-25	174	P
+USGS	01423000	2007-07-26	139	P
+USGS	01423000	2007-07-27	144	P
+USGS	01423000	2007-07-28	240	P
+USGS	01423000	2007-07-29	231	P
+USGS	01423000	2007-07-30	164	P
+USGS	01423000	2007-07-31	140	P
+USGS	01423000	2007-08-01	125	P
+USGS	01423000	2007-08-02	112	P
+USGS	01423000	2007-08-03	114	P
+USGS	01423000	2007-08-04	110	P
+USGS	01423000	2007-08-05	95	P
+USGS	01423000	2007-08-06	87	P
+USGS	01423000	2007-08-07	85	P
+USGS	01423000	2007-08-08	118	P
+USGS	01423000	2007-08-09	121	P
+USGS	01423000	2007-08-10	489	P
+USGS	01423000	2007-08-11	488	P
+USGS	01423000	2007-08-12	261	P
+USGS	01423000	2007-08-13	201	P
+USGS	01423000	2007-08-14	171	P
+USGS	01423000	2007-08-15	150	P
+USGS	01423000	2007-08-16	155	P
+USGS	01423000	2007-08-17	256	P
+USGS	01423000	2007-08-18	188	P
+USGS	01423000	2007-08-19	163	P
+USGS	01423000	2007-08-20	143	P
+USGS	01423000	2007-08-21	158	P
+USGS	01423000	2007-08-22	174	P
+USGS	01423000	2007-08-23	148	P
+USGS	01423000	2007-08-24	138	P
+USGS	01423000	2007-08-25	141	P
+USGS	01423000	2007-08-26	123	P
+USGS	01423000	2007-08-27	112	P
+USGS	01423000	2007-08-28	103	P
+USGS	01423000	2007-08-29	95	P
+USGS	01423000	2007-08-30	89	P
+USGS	01423000	2007-08-31	84	P
+USGS	01423000	2007-09-01	79	P
+USGS	01423000	2007-09-02	74	P
+USGS	01423000	2007-09-03	70	P
+USGS	01423000	2007-09-04	67	P
+USGS	01423000	2007-09-05	63	P
+USGS	01423000	2007-09-06	60	P
+USGS	01423000	2007-09-07	58	P
+USGS	01423000	2007-09-08	68	P
+USGS	01423000	2007-09-09	135	P
+USGS	01423000	2007-09-10	124	P
+USGS	01423000	2007-09-11	983	P
+USGS	01423000	2007-09-12	1220	P
+USGS	01423000	2007-09-13	542	P
+USGS	01423000	2007-09-14	397	P
+USGS	01423000	2007-09-15		
+USGS	01423000	2007-09-16	625	P
+USGS	01423000	2007-09-17	465	P
+USGS	01423000	2007-09-18	392	P
diff --git a/scikits/learn/datasets/nwis/src/convert.py b/scikits/learn/datasets/nwis/src/convert.py
new file mode 100644
index 0000000000..3efa78cb00
--- /dev/null
+++ b/scikits/learn/datasets/nwis/src/convert.py
@@ -0,0 +1,26 @@
+import numpy, csv, time
+from scikits.learn.datasets.misc import dumpvar
+
+
+# Regenerate ../nwis.py from the raw USGS tab-delimited export.
+n_comments = 28  # leading rows to skip: '#' comments plus two header rows
+LABELS = ['agency', 'station', 'date', 'discharge', 'code']
+
+# The delimiter goes to the reader itself: reader.dialect is read-only.
+f = open('01423000.dly', 'r')
+try:
+    rows = list(csv.reader(f, delimiter='\t'))[n_comments:]
+finally:
+    f.close()
+table = numpy.array(rows)
+AGENCY, STATION, DATE, DISCHARGE, CODE = table.transpose()
+
+# Write the date, discharge and code columns (indices 2-4) in nwis.py
+a = open("../nwis.py", "w")
+try:
+    a.write('# Autogenerated by convert.py at %s\n\n' %
+            time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime()))
+    for i in range(2, 5):
+        a.writelines(dumpvar(list(table[:, i]), LABELS[i]))
+finally:
+    a.close()
-- 
GitLab