Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
S
scikit-learn
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Ian Johnson
scikit-learn
Commits
4efdd779
Commit
4efdd779
authored
14 years ago
by
Olivier Grisel
Browse files
Options
Downloads
Patches
Plain Diff
readd the dense version of the vectorizer
parent
3b9b8b46
No related branches found
No related tags found
No related merge requests found
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
scikits/learn/features/tests/test_text.py
+1
-1
1 addition, 1 deletion
scikits/learn/features/tests/test_text.py
scikits/learn/features/text.py
+96
-0
96 additions, 0 deletions
scikits/learn/features/text.py
with
97 additions
and
1 deletion
scikits/learn/features/tests/test_text.py
+
1
−
1
View file @
4efdd779
...
@@ -41,7 +41,7 @@ def test_simple_analyzer():
...
@@ -41,7 +41,7 @@ def test_simple_analyzer():
assert_equal
(
sa
.
analyze
(
text
),
expected
)
assert_equal
(
sa
.
analyze
(
text
),
expected
)
def
test_tf_idf
():
def
test_
dense_
tf_idf
():
hv
=
HashingVectorizer
(
dim
=
1000
,
probes
=
3
)
hv
=
HashingVectorizer
(
dim
=
1000
,
probes
=
3
)
# junk food documents
# junk food documents
...
...
This diff is collapsed.
Click to expand it.
scikits/learn/features/text.py
+
96
−
0
View file @
4efdd779
...
@@ -84,6 +84,102 @@ class SimpleAnalyzer(object):
...
@@ -84,6 +84,102 @@ class SimpleAnalyzer(object):
return
tokens
return
tokens
class HashingVectorizer(object):
    """Compute term frequency vectors using a hashed term space.

    See the Hashing-trick related papers referenced by John Langford on this
    page to get a grasp on the usefulness of this representation:

      http://hunch.net/~jl/projects/hash_reps/index.html

    Parameters
    ----------
    dim : int
        Number of buckets. A higher dim means a lower collision rate but
        also higher memory requirements and higher processing times on the
        resulting tfidf vectors.
    probes : int
        Number of hashed buckets updated for each token.
    analyzer : object with an ``analyze(text)`` method, optional
        Turns a raw text into a sized sequence of tokens. When omitted, a
        new ``SimpleAnalyzer()`` is created for this vectorizer.
    use_idf : bool
        When true, document frequency estimates are updated while sampling
        and ``get_vectors`` returns IDF-weighted vectors.

    TODO handle bigrams in a smart way such as demonstrated here:

      http://streamhacker.com/2010/05/24/text-classification-sentiment-analysis-stopwords-collocations/
    """
    # TODO: implement me using the murmurhash that might be faster: but profile
    # me first :)

    # TODO: make it possible to select between the current dense representation
    # and sparse alternatives from scipy.sparse once the liblinear and libsvm
    # wrappers have been updated to be able to handle it efficiently

    def __init__(self, dim=5000, probes=1, analyzer=None, use_idf=True):
        self.dim = dim
        self.probes = probes
        # Resolve the default lazily so that each vectorizer gets its own
        # analyzer instead of one instance created at definition time and
        # shared by every vectorizer.
        self.analyzer = analyzer if analyzer is not None else SimpleAnalyzer()
        self.use_idf = use_idf

        # start counts at one to avoid zero division while
        # computing IDF
        self.df_counts = np.ones(dim, dtype=np.int64)
        # history of sampled term frequency vectors, one row per document
        self.tf_vectors = None
        # number of documents that contributed to the DF estimates
        self.sampled = 0

    def hash_sign(self, token, probe=0):
        """Return the ``(bucket_index, sign)`` pair of a token for a probe.

        The token is suffixed with ``probe`` copies of ``#`` before being
        hashed so that each probe maps to a (most likely) different bucket.
        The sign is ``1.0`` for an even hash value and ``-1.0`` otherwise.

        NOTE: this relies on the builtin ``hash``; on interpreters with string
        hash randomization the buckets may differ from one process to another.
        """
        h = hash(token + (probe * u"#"))
        return abs(h) % self.dim, 1.0 if h % 2 == 0 else -1.0

    def sample_document(self, text, tf_vector=None, update_estimates=True):
        """Extract features from text and update running freq estimates.

        Parameters
        ----------
        text : object accepted by ``self.analyzer.analyze``
            The raw document.
        tf_vector : float array of length ``dim``, optional
            Buffer updated in place with the term frequencies. When omitted,
            a new row is appended to ``self.tf_vectors`` and used as buffer.
        update_estimates : bool
            When true (and ``use_idf`` is set), the document frequency
            counters and the sampled documents count are updated.

        Returns
        -------
        The term frequency vector of the document. A document without any
        token yields an unchanged (all zeros for a fresh buffer) vector.
        """
        if tf_vector is None:
            # allocate term frequency vector and stack to history
            tf_vector = np.zeros(self.dim, np.float64)
            if self.tf_vectors is None:
                self.tf_vectors = tf_vector.reshape((1, self.dim))
            else:
                self.tf_vectors = np.vstack((self.tf_vectors, tf_vector))
            # work on the history row itself so that the in-place updates
            # below are recorded in self.tf_vectors
            tf_vector = self.tf_vectors[-1]

        tokens = self.analyzer.analyze(text)
        for token in tokens:
            # TODO add support for cooccurence tokens in a sentence
            # window
            for probe in range(self.probes):
                i, incr = self.hash_sign(token, probe)
                tf_vector[i] += incr

        # Normalize by the number of bucket updates. Skip it when nothing was
        # counted: dividing by zero would fill the vector with NaN values,
        # which then compare as "non zero" and corrupt the DF estimates.
        n_updates = len(tokens) * self.probes
        if n_updates:
            tf_vector /= n_updates

        if update_estimates and self.use_idf:
            # update the running DF estimate
            self.df_counts += tf_vector != 0.0
            self.sampled += 1
        return tf_vector

    def get_idf(self):
        """Return ``log(sampled / df_counts)`` for each bucket."""
        return np.log(float(self.sampled) / self.df_counts)

    def get_tfidf(self):
        """Compute the TF-log(IDF) vectors of the sampled documents.

        Returns None when no document has been recorded yet.
        """
        if self.tf_vectors is None:
            return None
        return self.tf_vectors * self.get_idf()

    def vectorize(self, document_filepaths):
        """Vectorize a batch of documents.

        Each path of ``document_filepaths`` is read entirely and sampled; the
        resulting rows are appended to ``self.tf_vectors``.
        """
        tf_vectors = np.zeros((len(document_filepaths), self.dim))
        for i, filepath in enumerate(document_filepaths):
            # the context manager closes the file as soon as it is read
            with open(filepath) as document:
                text = document.read()
            self.sample_document(text, tf_vectors[i])

        if self.tf_vectors is None:
            self.tf_vectors = tf_vectors
        else:
            self.tf_vectors = np.vstack((self.tf_vectors, tf_vectors))

    def get_vectors(self):
        """Return the TF-IDF vectors if ``use_idf`` is set, else the raw TF."""
        if self.use_idf:
            return self.get_tfidf()
        else:
            return self.tf_vectors
class
SparseHashingVectorizer
(
object
):
class
SparseHashingVectorizer
(
object
):
"""
Compute term frequencies vectors using hashed term space in sparse matrix
"""
Compute term frequencies vectors using hashed term space in sparse matrix
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment