Commit cba81c15, authored 14 years ago by Olivier Grisel

more work on document classification dataset loader

Parent: ea20d1c3

Changes: 2 changed files, with 57 additions and 19 deletions

- scikits/learn/datasets/mlcomp.py: 26 additions, 8 deletions
- scikits/learn/features/text.py: 31 additions, 11 deletions

scikits/learn/datasets/mlcomp.py (+26 −8)

@@ -4,11 +4,29 @@
 import os

 from scikits.learn.datasets.base import Bunch
+from scikits.learn.features.text import HashingVectorizer


-def load_document_classification(dataset_path, metadata, **kw):
-    return Bunch(data=None, target=None, target_names=None,
-                 DESCR=metadata.get('description'))
+def load_document_classification(dataset_path, metadata, set_, **kw):
+    """Loader implementation for the DocumentClassification format"""
+    target = []
+    target_names = {}
+    vectorizer = kw.get('vectorizer', HashingVectorizer())
+
+    dataset_path = os.path.join(dataset_path, set_)
+    folders = [f for f in sorted(os.listdir(dataset_path))
+               if os.path.isdir(os.path.join(dataset_path, f))]
+    for label, folder in enumerate(folders):
+        target_names[label] = folder
+        folder_path = os.path.join(dataset_path, folder)
+        documents = [os.path.join(folder_path, d)
+                     for d in sorted(os.listdir(folder_path))]
+        vectorizer.vectorize(documents)
+        target.extend(len(documents) * [label])
+
+    return Bunch(data=vectorizer.get_vectors(), target=target,
+                 target_names=target_names,
+                 DESCR=metadata.get('description'))


 LOADERS = {
     'DocumentClassification': load_document_classification,
@@ -16,7 +34,7 @@ LOADERS = {
 }


-def load_mlcomp(name_or_id, mlcomp_root=None, **kwargs):
+def load_mlcomp(name_or_id, mlcomp_root=None, set_="raw", **kwargs):
     """Load a datasets as downloaded from http://mlcomp.org

     Parameters
@@ -29,6 +47,8 @@ def load_mlcomp(name_or_id, mlcomp_root=None, **kwargs):
         are stored, if mlcomp_root is None, the MLCOMP_DATASETS_HOME
         environment variable is looked up instead.

+    set_ : select the portion to load: 'train', 'test' or 'raw'
+
     **kwargs : domain specific kwargs to be passed to the dataset loader.

     Returns
@@ -60,7 +80,6 @@ def load_mlcomp(name_or_id, mlcomp_root=None, **kwargs):
     if not os.path.exists(mlcomp_root):
         raise ValueError("Could not find folder: " + mlcomp_root)

-
     # dataset lookup
     if isinstance(name_or_id, int):
         # id lookup
@@ -95,9 +114,8 @@ def load_mlcomp(name_or_id, mlcomp_root=None, **kwargs):
     loader = LOADERS.get(format)
     if loader is None:
         raise ValueError("No loader implemented for format: " + format)
-    return loader(dataset_path, metadata, **kwargs)
+    return loader(dataset_path, metadata, set_=set_, **kwargs)


 if __name__ == "__main__":
-    print load_mlcomp('20news-18828')
-    print load_mlcomp(379)
+    twentynews = load_mlcomp('20news-18828')
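
For context, a sketch of how the new set_ parameter would be used. It assumes the '20news-18828' dataset has already been downloaded from mlcomp.org into the folder that MLCOMP_DATASETS_HOME points to, and that its on-disk layout provides 'train' and 'test' portions alongside 'raw':

    from scikits.learn.datasets.mlcomp import load_mlcomp

    # Load the train/test portions instead of the default 'raw' dump.
    news_train = load_mlcomp('20news-18828', set_='train')
    news_test = load_mlcomp('20news-18828', set_='test')

    # Each Bunch carries the hashed (and, with use_idf=True, IDF-weighted)
    # document vectors plus integer labels mapped by target_names.
    X_train, y_train = news_train.data, news_train.target
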
scikits/learn/features/text.py (+31 −11)

@@ -26,9 +26,12 @@ class SimpleAnalyzer(object):
     token_pattern = re.compile(r"\b\w\w+\b", re.U)

+    def __init__(self, default_charset='utf-8'):
+        self.charset = default_charset
+
     def analyze(self, text_document):
         if isinstance(text_document, str):
-            text_document = text_document.decode("utf-8")
+            text_document = text_document.decode(self.charset, 'ignore')
         text_document = strip_accents(text_document.lower())
         return re.findall(self.token_pattern, text_document)
@@ -54,10 +57,12 @@ class HashingVectorizer(object):
     # TODO: implement me using the murmurhash that might be faster: but profile
     # me first :)

-    def __init__(self, dim=5000, probes=3, analyzer=SimpleAnalyzer()):
+    def __init__(self, dim=5000, probes=3, analyzer=SimpleAnalyzer(),
+                 use_idf=True):
         self.dim = dim
         self.probes = probes
         self.analyzer = analyzer
+        self.use_idf = use_idf

         # start counts at one to avoid zero division while
         # computing IDF
@@ -89,24 +94,39 @@
             tf_vector[i] += incr
         tf_vector /= len(tokens) * self.probes

-        if update_estimates:
+        if update_estimates and self.use_idf:
             # update the running DF estimate
             self.df_counts += tf_vector != 0.0
             self.sampled += 1
         return tf_vector

+    def get_idf(self):
+        return np.log(float(self.sampled) / self.df_counts)
+
     def get_tfidf(self):
         """Compute the TF-log(IDF) vectors of the sampled documents"""
-        return self.tf_vectors * np.log(float(self.sampled) / self.df_counts)
+        if self.tf_vectors is None:
+            return None
+        return self.tf_vectors * self.get_idf()

-    def vectorize(self, root_folder):
-        """Scan a folder structure for text documents and estimate frequencies
-
-        If this is a 2 level folder structure the first level is assumed to be
-        categories to be used as labels for supervised learning.
-        """
-        # TODO: implement me!
-        pass
+    def vectorize(self, document_filepaths):
+        """Vectorize a batch of documents"""
+        tf_vectors = np.zeros((len(document_filepaths), self.dim))
+        for i, filepath in enumerate(document_filepaths):
+            self.sample_document(file(filepath).read(), tf_vectors[i])
+
+        if self.tf_vectors is None:
+            self.tf_vectors = tf_vectors
+        else:
+            self.tf_vectors = np.vstack((self.tf_vectors, tf_vectors))
+
+    def get_vectors(self):
+        if self.use_idf:
+            return self.get_tfidf()
+        else:
+            return self.tf_vectors
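
Two things change in SimpleAnalyzer: the charset becomes configurable, and undecodable bytes are now dropped ('ignore') instead of raising a UnicodeDecodeError. A rough Python 3 rendition of the resulting pipeline (bytes stands in for Python 2's str, and since the module's strip_accents helper is not shown in this diff, a Unicode-normalization stand-in is used here):

    import re
    import unicodedata

    token_pattern = re.compile(r"\b\w\w+\b", re.U)

    def strip_accents(s):
        # stand-in for the strip_accents helper defined elsewhere in text.py
        return ''.join(c for c in unicodedata.normalize('NFKD', s)
                       if not unicodedata.combining(c))

    def analyze(text_document, charset='utf-8'):
        if isinstance(text_document, bytes):
            # 'ignore' drops undecodable bytes rather than raising, as in the diff
            text_document = text_document.decode(charset, 'ignore')
        text_document = strip_accents(text_document.lower())
        return token_pattern.findall(text_document)

    print(analyze(b"Caf\xc3\xa9s ouverts"))  # ['cafes', 'ouverts']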
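
On the HashingVectorizer side, the new get_idf helper is the usual log(N / df) weight computed from the running sampled and df_counts estimates, and get_vectors now picks between raw TF and TF-IDF output depending on use_idf. A minimal self-contained sketch of the hashing-with-probes TF accumulation plus the running DF estimate (the hashed_tf helper and its CRC32-based bucket choice are hypothetical; the real logic lives in HashingVectorizer.sample_document, whose hash function is not shown in this diff):

    import numpy as np
    from zlib import crc32

    def hashed_tf(tokens, dim=5000, probes=3):
        """Accumulate term frequencies into a fixed-size vector by hashing.

        Each token increments `probes` distinct buckets, which dampens the
        effect of hash collisions; the vector is then normalized the same
        way the diff does: len(tokens) * probes.
        """
        tf = np.zeros(dim)
        for token in tokens:
            for probe in range(probes):
                i = crc32(("%s#%d" % (token, probe)).encode("utf-8")) % dim
                tf[i] += 1.0
        return tf / (len(tokens) * probes)

    # Running document-frequency estimate, mirroring df_counts/sampled:
    # counts start at one to avoid a zero division when computing the IDF,
    # as the original comment in HashingVectorizer.__init__ explains.
    dim = 5000
    df_counts = np.ones(dim)
    sampled = 0
    for tokens in (["hashing", "trick"], ["hashing", "idf", "estimate"]):
        df_counts += hashed_tf(tokens, dim) != 0.0
        sampled += 1

    idf = np.log(float(sampled) / df_counts)  # what get_idf returns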