diff --git a/scikits/learn/datasets/mlcomp.py b/scikits/learn/datasets/mlcomp.py
index 9e87eebb6434eebedbf2dd324e7c5d7a89cfd85e..bde4a7f1ffb10cec865e268e68723f7f1f1ebe4f 100644
--- a/scikits/learn/datasets/mlcomp.py
+++ b/scikits/learn/datasets/mlcomp.py
@@ -4,11 +4,29 @@
 
 import os
 from scikits.learn.datasets.base import Bunch
+from scikits.learn.features.text import HashingVectorizer
 
 
-def load_document_classification(dataset_path, metadata, **kw):
-    return Bunch(data=None, target=None, target_names=None,
-                 DESCR=metadata.get('description'))
+def load_document_classification(dataset_path, metadata, set_, **kw):
+    """Loader implementation for the DocumentClassification format"""
+    target = []
+    target_names = {}
+    vectorizer = kw.get('vectorizer', HashingVectorizer())
+
+    dataset_path = os.path.join(dataset_path, set_)
+    folders = [f for f in sorted(os.listdir(dataset_path))
+               if os.path.isdir(os.path.join(dataset_path, f))]
+    for label, folder in enumerate(folders):
+        target_names[label] = folder
+        folder_path = os.path.join(dataset_path, folder)
+        documents = [os.path.join(folder_path, d)
+                     for d in sorted(os.listdir(folder_path))]
+        vectorizer.vectorize(documents)
+        target.extend(len(documents) * [label])
+
+    return Bunch(data=vectorizer.get_vectors(), target=target,
+                 target_names=target_names, DESCR=metadata.get('description'))
+
 
 LOADERS = {
     'DocumentClassification': load_document_classification,
@@ -16,7 +34,7 @@ LOADERS = {
 }
 
 
-def load_mlcomp(name_or_id, mlcomp_root=None, **kwargs):
+def load_mlcomp(name_or_id, mlcomp_root=None, set_="raw", **kwargs):
     """Load a datasets as downloaded from http://mlcomp.org
 
     Parameters
@@ -29,6 +47,8 @@ def load_mlcomp(name_or_id, mlcomp_root=None, **kwargs):
                   are stored, if mlcomp_root is None, the MLCOMP_DATASETS_HOME
                   environment variable is looked up instead.
 
+    set_ : select which portion of the dataset to load: 'train', 'test' or 'raw'
+
     **kwargs : domain specific kwargs to be passed to the dataset loader.
 
     Returns
@@ -60,7 +80,6 @@ def load_mlcomp(name_or_id, mlcomp_root=None, **kwargs):
     if not os.path.exists(mlcomp_root):
         raise ValueError("Could not find folder: " + mlcomp_root)
 
-
     # dataset lookup
     if isinstance(name_or_id, int):
         # id lookup
@@ -95,9 +114,8 @@ def load_mlcomp(name_or_id, mlcomp_root=None, **kwargs):
     loader = LOADERS.get(format)
     if loader is None:
         raise ValueError("No loader implemented for format: " + format)
-    return loader(dataset_path, metadata, **kwargs)
+    return loader(dataset_path, metadata, set_=set_, **kwargs)
 
 
 if __name__ == "__main__":
-    print load_mlcomp('20news-18828')
-    print load_mlcomp(379)
+    twentynews = load_mlcomp('20news-18828')
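For reference, a minimal usage sketch of the new set_ parameter and the vectorizer pass-through (not part of the patch; it assumes the 20news-18828 dataset has already been downloaded under MLCOMP_DATASETS_HOME and contains a 'train' subfolder, and the dim value is arbitrary):

    from scikits.learn.datasets.mlcomp import load_mlcomp
    from scikits.learn.features.text import HashingVectorizer

    # load the training portion; the vectorizer keyword is forwarded
    # through **kwargs to the DocumentClassification loader
    news_train = load_mlcomp('20news-18828', set_='train',
                             vectorizer=HashingVectorizer(dim=10000))

    # news_train.data is an (n_documents, dim) array of TF-IDF weights,
    # news_train.target is a list of integer labels and
    # news_train.target_names maps each label to its newsgroup folder name.
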
diff --git a/scikits/learn/features/text.py b/scikits/learn/features/text.py
index 41766e58e4e2d7f1f5fb2422de2766df9d40ba2e..e35e00b1d7a3d78329feaf90faa40e6a886e9156 100644
--- a/scikits/learn/features/text.py
+++ b/scikits/learn/features/text.py
@@ -26,9 +26,12 @@ class SimpleAnalyzer(object):
 
     token_pattern = re.compile(r"\b\w\w+\b", re.U)
 
+    def __init__(self, default_charset='utf-8'):
+        self.charset = default_charset
+
     def analyze(self, text_document):
         if isinstance(text_document, str):
-            text_document = text_document.decode("utf-8")
+            text_document = text_document.decode(self.charset, 'ignore')
         text_document = strip_accents(text_document.lower())
         return re.findall(self.token_pattern, text_document)
 
@@ -54,10 +57,12 @@ class HashingVectorizer(object):
     # TODO: implement me using the murmurhash that might be faster: but profile
     # me first :)
 
-    def __init__(self, dim=5000, probes=3, analyzer=SimpleAnalyzer()):
+    def __init__(self, dim=5000, probes=3, analyzer=SimpleAnalyzer(),
+                 use_idf=True):
         self.dim = dim
         self.probes = probes
         self.analyzer = analyzer
+        self.use_idf = use_idf
 
         # start counts at one to avoid zero division while
         # computing IDF
@@ -89,24 +94,39 @@ class HashingVectorizer(object):
                 tf_vector[i] += incr
         tf_vector /= len(tokens) * self.probes
 
-        if update_estimates:
+        if update_estimates and self.use_idf:
             # update the running DF estimate
             self.df_counts += tf_vector != 0.0
             self.sampled += 1
         return tf_vector
 
+    def get_idf(self):
+        return np.log(float(self.sampled) / self.df_counts)
+
     def get_tfidf(self):
         """Compute the TF-log(IDF) vectors of the sampled documents"""
-        return self.tf_vectors * np.log(float(self.sampled) / self.df_counts)
+        if self.tf_vectors is None:
+            return None
+        return self.tf_vectors * self.get_idf()
+
+    def vectorize(self, document_filepaths):
+        """Vectorize a batch of documents"""
+        tf_vectors = np.zeros((len(document_filepaths), self.dim))
+        for i, filepath in enumerate(document_filepaths):
+            self.sample_document(open(filepath).read(), tf_vectors[i])
+
+        if self.tf_vectors is None:
+            self.tf_vectors = tf_vectors
+        else:
+            self.tf_vectors = np.vstack((self.tf_vectors, tf_vectors))
+
 
-    def vectorize(self, root_folder):
-        """Scan a folder structure for text documents and estimate frequencies
+    def get_vectors(self):
+        if self.use_idf:
+            return self.get_tfidf()
+        else:
+            return self.tf_vectors
 
-        If this is a 2 level folder structure the first level is assumed to be
-        categories to be used as labels for supervised learning.
-        """
-        # TODO: implement me!
-        pass
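
For context, the new batch API on HashingVectorizer can be exercised directly along these lines (a sketch; the file names are hypothetical plain-text documents on disk):

    from scikits.learn.features.text import HashingVectorizer

    hv = HashingVectorizer(dim=5000, probes=3, use_idf=True)
    hv.vectorize(['doc1.txt', 'doc2.txt'])   # hypothetical file paths
    vectors = hv.get_vectors()               # (2, 5000) array of TF-IDF weights

Note that repeated calls to vectorize stack new rows onto self.tf_vectors with np.vstack, so get_vectors returns the vectors of every document sampled so far.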