Commit 0cbd220b authored 14 years ago by Olivier Grisel
work in progress on vocabulary dimension restriction
parent e7c11d96
Showing 1 changed file with 74 additions and 17 deletions:
scikits/learn/feature_extraction/text/dense.py (+74, −17)
...
@@ -4,7 +4,7 @@
 # License: BSD Style.
 """
 Utilities to build feature vectors from text documents
 """
-from collections import defaultdict
+from operator import itemgetter
 import re
 import unicodedata
 import numpy as np
...
@@ -186,43 +186,100 @@ class BaseCountVectorizer(BaseEstimator):
     vocabulary: dict, optional
         A dictionary where keys are tokens and values are indices in the
         matrix. This is useful in order to fix the vocabulary in advance.
 
+    max_df : float in range [0.0, 1.0], optional, 0.5 by default
+        When building the vocabulary, ignore terms that have a term
+        frequency higher than the given threshold (corpus-specific stop
+        words). This parameter is ignored if vocabulary is not None.
+
+    max_features : optional, None by default
+        If not None, build a vocabulary that only considers the top
+        max_features terms ordered by term frequency across the corpus.
+        This parameter is ignored if vocabulary is not None.
+
     dtype: type, optional
         Type of the matrix returned by fit_transform() or transform().
     """
-    def __init__(self, analyzer=DEFAULT_ANALYZER, vocabulary={}, dtype=long):
+    def __init__(self, analyzer=DEFAULT_ANALYZER, vocabulary={}, max_df=0.5,
+                 max_features=None, dtype=long):
         self.analyzer = analyzer
         self.vocabulary = vocabulary
         self.dtype = dtype
+        self.max_df = max_df
+        self.max_features = max_features
 
     def _init_matrix(self, shape):
         raise NotImplementedError
 
     def _build_vectors_and_vocab(self, raw_documents):
-        vocab = {}  # token => idx
-        docs = []
+        # result of document conversion to term_count_dict
+        term_counts_per_doc = []
+        term_counts = {}
 
-        for doc in raw_documents:
-            doc_dict = {}  # idx => count
+        # term counts across entire corpus (count each term maximum once per
+        # document)
+        document_counts = {}
 
-            for token in self.analyzer.analyze(doc):
-                if not token in vocab:
-                    vocab[token] = len(vocab)
-                idx = vocab[token]
-                doc_dict[idx] = doc_dict.get(idx, 0) + 1
+        max_df = self.max_df
+        max_features = self.max_features
 
-            docs.append(doc_dict)
+        for doc in raw_documents:
+            term_count_dict = {}  # term => count in doc
+
+            for term in self.analyzer.analyze(doc):
+                term_count_dict[term] = term_count_dict.get(term, 0) + 1
+                term_counts[term] = term_counts.get(term, 0) + 1
+
+            if max_df is not None:
+                for term in term_count_dict.iterkeys():
+                    document_counts[term] = document_counts.get(term, 0) + 1
+
+            term_counts_per_doc.append(term_count_dict)
+
+        n_doc = len(term_counts_per_doc)
+
+        # filter out stop words: terms that occur in almost all documents
+        stop_words = set()
+        if max_df is not None:
+            max_document_count = max_df * n_doc
+            for t, dc in sorted(document_counts.iteritems(),
+                                key=itemgetter(1), reverse=True):
+                if dc < max_document_count:
+                    break
+                stop_words.add(t)
+
+        # list the terms that should be part of the vocabulary
+        if max_features is not None:
+            # extract the most frequent terms for the vocabulary
+            terms = set()
+            for t, tc in sorted(term_counts.iteritems(),
+                                key=itemgetter(1), reverse=True):
+                if t not in stop_words:
+                    terms.add(t)
+                if len(terms) >= max_features:
+                    break
+        else:
+            terms = set(term_counts.iteritems())
+            terms -= stop_words
 
         # convert to a document-token matrix
-        matrix = self._init_matrix((len(docs), len(vocab)))
-        for i, doc_dict in enumerate(docs):
-            for idx, count in doc_dict.iteritems():
-                matrix[i, idx] = count
-        return matrix, vocab
+        vocabulary = dict(((t, i) for i, t in enumerate(terms)))  # token => idx
+
+        # find the indices of the tokens
+        matrix = self._init_matrix((n_doc, len(vocabulary)))
+        for i, term_count_dict in enumerate(term_counts_per_doc):
+            for term, count in term_count_dict.iteritems():
+                idx = vocabulary.get(term)
+                if idx is not None:
+                    matrix[i, idx] = count
+        return matrix, vocabulary
 
     def _build_vectors(self, raw_documents):
         # raw_documents is an iterable so we don't know its size in advance
...
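For readers who want to experiment with the vocabulary restriction outside the class, here is a minimal standalone sketch of the same pruning strategy. It is only an illustration, not the library's API: the helper name restricted_vocabulary and the toy corpus are made up, it takes pre-tokenized documents instead of going through an analyzer, and it uses items() so it runs on modern Python (the diff itself is Python 2, hence iteritems and long). Note also that the else branch in the diff builds terms from term_counts.iteritems(), i.e. (term, count) pairs; since this is a work-in-progress commit, the sketch assumes the intent was the term keys.

from operator import itemgetter

def restricted_vocabulary(tokenized_docs, max_df=0.5, max_features=None):
    # hypothetical helper mirroring the pruning logic in the diff above
    term_counts = {}      # term => count across the whole corpus
    # term => number of documents containing it (each term counted at
    # most once per document, as in the commit)
    document_counts = {}
    for tokens in tokenized_docs:
        seen = {}
        for t in tokens:
            seen[t] = seen.get(t, 0) + 1
            term_counts[t] = term_counts.get(t, 0) + 1
        for t in seen:
            document_counts[t] = document_counts.get(t, 0) + 1
    n_doc = len(tokenized_docs)

    # corpus-specific stop words: terms whose document count reaches
    # max_df * n_doc are dropped
    stop_words = set()
    if max_df is not None:
        max_document_count = max_df * n_doc
        for t, dc in sorted(document_counts.items(),
                            key=itemgetter(1), reverse=True):
            if dc < max_document_count:
                break
            stop_words.add(t)

    # keep at most max_features of the surviving terms, most frequent first
    if max_features is not None:
        terms = set()
        for t, tc in sorted(term_counts.items(),
                            key=itemgetter(1), reverse=True):
            if t not in stop_words:
                terms.add(t)
            if len(terms) >= max_features:
                break
    else:
        terms = set(term_counts) - stop_words

    return dict((t, i) for i, t in enumerate(terms))  # token => idx

# "the" occurs in all five toy documents, so with max_df=0.5 it is
# pruned as a corpus-specific stop word before max_features applies
docs = [["the", "cat", "sat"], ["the", "cat", "sat"],
        ["the", "dog", "ran"], ["the", "dog"], ["the", "bird"]]
print(sorted(restricted_vocabulary(docs, max_df=0.5, max_features=3)))
# -> ['cat', 'dog', 'sat']

The two filters compose as in the diff: document-frequency pruning runs first and the max_features cap is applied only to the terms that survive it.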