24 #ifndef XAPIAN_INCLUDED_CLUSTER_H
25 #define XAPIAN_INCLUDED_CLUSTER_H
27 #if !defined XAPIAN_IN_XAPIAN_H && !defined XAPIAN_LIB_BUILD
28 #error Never use <xapian/cluster.h> directly; include <xapian.h> instead.
37 #include <string_view>
38 #include <unordered_map>
39 #include <unordered_set>
52 STEM_NONE, STEM_SOME,
STEM_ALL, STEM_ALL_Z, STEM_SOME_FULL_POS
65 return stop_words.find(
term) != stop_words.end();
167 opt_intrusive_base::release();
179 opt_intrusive_base::release();
192 std::unordered_map<std::string, doccount>
termfreq;
230 std::unordered_map<std::string, double>
weights;
233 double magnitude = 0.0;
243 weights[std::string(
term)] = weight;
264 return weights.find(std::string(
term)) != weights.end();
272 auto it = weights.find(std::string(
term));
273 return (it == weights.end()) ? 0.0 : it->second;
283 weights[std::string(
term)] += weight;
300 opt_intrusive_base::release();
312 opt_intrusive_base::release();
576 opt_intrusive_base::release();
588 opt_intrusive_base::release();
635 explicit KMeans(
unsigned int k_,
unsigned int max_iters_ = 0);
Compiler attribute macros.
Class to represent cluster centroids in the vector space.
Centroid()
Default constructor.
Centroid(const Point &point)
Constructor with Point argument.
void clear()
Clear the terms and corresponding values of the centroid.
void divide(double cluster_size)
Divide the weight of terms in the centroid by 'cluster_size' and recalculate the magnitude.
Class for storing the results returned by the Clusterer.
ClusterSet(ClusterSet &&other)
Move constructor.
Xapian::doccount size() const
Return the number of clusters.
ClusterSet & operator=(ClusterSet &&other)
Move assignment operator.
ClusterSet(const ClusterSet &other)
Copying is allowed.
ClusterSet & operator=(const ClusterSet &other)
Assignment is allowed.
void add_to_cluster(const Point &point, unsigned int index)
Add the point to the cluster at position 'index'.
void add_cluster(const Cluster &cluster)
Add a cluster to the ClusterSet.
ClusterSet()
Default constructor.
const Cluster & operator[](Xapian::doccount i) const
Return the cluster at index 'i'.
void recalculate_centroids()
Recalculate the centroid for all the clusters in the ClusterSet.
void clear_clusters()
Clear all the clusters in the ClusterSet.
Class to represent a Cluster which contains Points and the Centroid of the Cluster.
Cluster()
Default constructor.
const Centroid & get_centroid() const
Return the current centroid of the cluster.
Cluster(const Centroid &centroid)
Constructor.
const Point & operator[](Xapian::doccount i) const
Return the point at the given index in the cluster.
DocumentSet get_documents() const
Return the documents that are contained within the cluster.
void recalculate()
Recalculate the centroid of the Cluster after each iteration of the KMeans algorithm by taking the me...
void clear()
Clear the cluster weights.
void set_centroid(const Centroid &centroid)
Set the centroid of the Cluster to 'centroid'.
Cluster(const Cluster &other)
Copying is allowed.
Cluster & operator=(const Cluster &other)
Assignment is allowed.
Cluster(Cluster &&other)
Move constructor.
Cluster & operator=(Cluster &&other)
Move assignment operator.
Xapian::doccount size() const
Return size of the cluster.
void add_point(const Point &point)
Add a document to the Cluster.
Class representing an abstract class for a clusterer to be implemented.
virtual ~Clusterer()
Destructor.
virtual std::string get_description() const =0
Returns a string describing the clusterer being used.
virtual ClusterSet cluster(const MSet &mset)=0
Implement the required clustering algorithm in the subclass and return clustered output as Cluste...
Clusterer * release()
Start reference counting this object.
const Clusterer * release() const
Start reference counting this object.
Class for calculating the cosine distance between two documents.
double similarity(const PointType &a, const PointType &b) const override
Calculates and returns the cosine similarity using the formula cos(theta) = a.b/(|a|*|b|)
std::string get_description() const override
Return a string describing this object.
Class representing a set of documents in a cluster.
DocumentSet & operator=(const DocumentSet &other)
Assignment is allowed.
DocumentSet(DocumentSet &&other)
Move constructor.
void add_document(const Document &document)
Add a new Document to the DocumentSet.
DocumentSet(const DocumentSet &other)
Copying is allowed.
Xapian::doccount size() const
Return the size of the DocumentSet.
const Xapian::Document & operator[](Xapian::doccount i) const
Return the Document in the DocumentSet at index i.
DocumentSet()
Default constructor.
~DocumentSet()
Destructor.
DocumentSet & operator=(DocumentSet &&other)
Move assignment operator.
Class representing a document.
Base class for TermListGroup. Stores and provides terms that are contained in a document and their res...
virtual ~FreqSource()
Destructor.
FreqSource(const FreqSource &)=delete
Don't allow copying.
FreqSource * release()
Start reference counting this object.
virtual doccount get_termfreq(const std::string &tname) const =0
Return the term frequency of a particular term 'tname'.
virtual doccount get_doccount() const =0
Return the number of documents within the MSet.
const FreqSource * release() const
Start reference counting this object.
FreqSource()
Default constructor.
void operator=(const FreqSource &)=delete
Don't allow assignment.
Base class for objects managed by opt_intrusive_ptr.
Kmeans clusterer: This clusterer implements the K-Means clustering algorithm.
void set_stopper(const Xapian::Stopper *stop=NULL)
Set the Xapian::Stopper object to be used for identifying stopwords.
ClusterSet cluster(const MSet &mset) override
Implements the KMeans clustering algorithm.
KMeans(unsigned int k_, unsigned int max_iters_=0)
Constructor specifying number of clusters and maximum iterations.
unsigned int max_iters
Specifies the maximum number of iterations that KMeans will have.
Xapian::Internal::opt_intrusive_ptr< const Xapian::Stopper > stopper
Pointer to stopper object for identifying stopwords.
unsigned int k
Specifies that the clusterer needs to form 'k' clusters.
std::vector< Point > points
Contains the initialised points that are to be clustered.
void initialise_points(const MSet &source)
Initialise the Points to be fed into the Clusterer with the MSet object 'source'.
std::string get_description() const override
Return a string describing this object.
void initialise_clusters(ClusterSet &cset, Xapian::doccount num_of_points)
Initialise 'k' clusters by selecting 'k' centroids and assigning them to different clusters.
LCD clusterer: This clusterer implements the LCD clustering algorithm adapted from Modelling efficien...
LCDClusterer(unsigned int k_)
Constructor specifying number of clusters.
ClusterSet cluster(const MSet &mset) override
Implements the LCD clustering algorithm.
unsigned int k
Specifies that the clusterer needs to form 'k' clusters.
std::string get_description() const override
Return a string describing this object.
Class representing a list of search results.
Abstract class representing a point in the VSM.
void set_weight(std::string_view term, double weight)
Set the weight 'weight' to the mapping of a term.
double get_magnitude() const
Return the pre-computed squared magnitude.
void add_weight(std::string_view term, double weight)
Add the weight 'weight' to the mapping of a term.
PointType * release()
Start reference counting this object.
std::unordered_map< std::string, double > weights
Implement a map to store the terms within a document and their pre-computed TF-IDF weights.
TermIterator termlist_begin() const
Return a TermIterator to the beginning of the termlist.
TermIterator termlist_end() const noexcept
Return a TermIterator to the end of the termlist.
double get_weight(std::string_view term) const
Return the TF-IDF weight associated with a certain term.
const PointType * release() const
Start reference counting this object.
Xapian::termcount termlist_size() const
Return the size of the termlist.
bool contains(std::string_view term) const
Validate whether a certain term exists in the termlist or not by performing a lookup operation in the...
PointType()
Default constructor.
Class to represent a document as a point in the Vector Space Model.
Point(const FreqSource &freqsource, const Document &document)
Constructor. Initialise the point with terms and corresponding TF-IDF weights.
Document get_document() const
Returns the document corresponding to this Point.
Document document
The document which is being represented by the Point.
Base class for calculating the similarity between documents.
virtual ~Similarity()
Destructor.
virtual std::string get_description() const =0
Returns a string describing the similarity metric being used.
virtual double similarity(const PointType &a, const PointType &b) const =0
Calculates the similarity between the two documents.
Stopper subclass which checks for both stemmed and unstemmed stopwords.
std::string get_description() const override
Return a string describing this object.
stem_strategy
Stemming strategies.
stem_strategy stem_action
void add(std::string_view term)
Add a single stop word and its stemmed equivalent.
StemStopper(const Xapian::Stem &stemmer, stem_strategy strategy=STEM_SOME)
Constructor.
std::unordered_set< std::string > stop_words
bool operator()(const std::string &term) const override
Is term a stop-word?
Class representing a stemming algorithm.
Abstract base class for stop-word decision functor.
Class for iterating over a list of terms.
A class for construction of termlists which store the terms for a document along with the number of d...
doccount num_of_documents
Number of documents added to the termlist.
std::unordered_map< std::string, doccount > termfreq
Map of the terms and their corresponding term frequencies.
doccount get_termfreq(const std::string &tname) const override
Return the number of documents that the term 'tname' exists in.
void add_document(const Document &document, const Stopper *stopper=NULL)
Add a single document and calculate its corresponding term frequencies.
TermListGroup(const MSet &docs, const Stopper *stopper=NULL)
Constructor.
doccount get_doccount() const override
Return the number of documents within the MSet.
Class representing a list of search results.
The Xapian namespace contains public interfaces for the Xapian library.
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
parsing a user query string to build a Xapian::Query object
static Xapian::Stem stemmer
Define XAPIAN_VISIBILITY_* macros.
#define XAPIAN_VISIBILITY_DEFAULT