xapian-core  2.0.0
cluster.h
Go to the documentation of this file.
1 
4 /* Copyright (C) 2010 Richard Boulton
5  * Copyright (C) 2016 Richhiey Thomas
6  * Copyright (C) 2018 Uppinder Chugh
7  * Copyright (C) 2024 Olly Betts
8  *
9  * This program is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU General Public License as
11  * published by the Free Software Foundation; either version 2 of the
12  * License, or (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, see
21  * <https://www.gnu.org/licenses/>.
22  */
23 
24 #ifndef XAPIAN_INCLUDED_CLUSTER_H
25 #define XAPIAN_INCLUDED_CLUSTER_H
26 
27 #if !defined XAPIAN_IN_XAPIAN_H && !defined XAPIAN_LIB_BUILD
28 #error Never use <xapian/cluster.h> directly; include <xapian.h> instead.
29 #endif
30 
31 #include <xapian/attributes.h>
32 #include <xapian/mset.h>
33 #include <xapian/queryparser.h>
34 #include <xapian/types.h>
35 #include <xapian/visibility.h>
36 
37 #include <string_view>
38 #include <unordered_map>
39 #include <unordered_set>
40 #include <vector>
41 
42 namespace Xapian {
43 
49  public:
/// Stemming strategies.
// NOTE(review): the enumerator names look like they mirror the stemming
// strategies offered by Xapian::QueryParser (<xapian/queryparser.h> is
// included above) - confirm before assuming identical semantics.
51  typedef enum {
52  STEM_NONE, STEM_SOME, STEM_ALL, STEM_ALL_Z, STEM_SOME_FULL_POS
53  } stem_strategy;
54 
60  explicit StemStopper(const Xapian::Stem &stemmer, stem_strategy strategy = STEM_SOME);
61 
62  std::string get_description() const override;
63 
/// Stop-word test: true exactly when @a term is stored in stop_words.
64  bool operator()(const std::string& term) const override {
65  return stop_words.count(term) != 0;
66  }
67 
69  void add(std::string_view term);
70 
71  private:
73  std::unordered_set<std::string> stop_words;
75 };
76 
80  public:
81  class Internal;
84 
90  DocumentSet(const DocumentSet &other);
91 
98 
104 
110 
113 
116 
119 
122 
128  void add_document(const Document &document);
129 };
130 
138  void operator=(const FreqSource &) = delete;
139 
141  FreqSource(const FreqSource &) = delete;
142 
143  public:
146 
148  virtual ~FreqSource();
149 
154  virtual doccount get_termfreq(const std::string &tname) const = 0;
155 
157  virtual doccount get_doccount() const = 0;
158 
167  opt_intrusive_base::release();
168  return this;
169  }
170 
/// Start reference counting this object (const overload).
///
/// Delegates to opt_intrusive_base::release() and then returns this.
178  const FreqSource * release() const {
179  opt_intrusive_base::release();
180  return this;
181  }
182 };
183 
192  std::unordered_map<std::string, doccount> termfreq;
193 
196 
203  void add_document(const Document &document, const Stopper *stopper = NULL);
204 
205  public:
211  explicit TermListGroup(const MSet &docs, const Stopper *stopper = NULL);
212 
217  doccount get_termfreq(const std::string& tname) const override;
218 
219  doccount get_doccount() const override;
220 };
221 
226  protected:
230  std::unordered_map<std::string, double> weights;
231 
233  double magnitude = 0.0;
234 
/// Set the weight 'weight' to the mapping of a term.
///
/// Any weight previously stored for @a term is overwritten; a term not yet
/// present is inserted.  Only the weights map is touched here - magnitude is
/// left as it was.
242  void set_weight(std::string_view term, double weight) {
     // The map is keyed by std::string, so the view is copied into a string.
243  weights[std::string(term)] = weight;
244  }
245 
246  public:
249 
252 
/// Return a TermIterator to the end of the termlist.
///
/// The end position is represented by a TermIterator constructed from NULL.
254  TermIterator termlist_end() const noexcept {
255  return TermIterator(NULL);
256  }
257 
/// Membership test: true when @a term has an entry in the weights map.
263  bool contains(std::string_view term) const {
264  return weights.count(std::string(term)) != 0;
265  }
266 
/// Look up the weight stored for @a term; a term with no entry yields 0.0.
271  double get_weight(std::string_view term) const {
272  const auto entry = weights.find(std::string(term));
273  return (entry != weights.end()) ? entry->second : 0.0;
274  }
275 
/// Add the weight 'weight' to the mapping of a term.
///
/// Only the weights map is updated here; magnitude is not recomputed.
282  void add_weight(std::string_view term, double weight) {
     // operator[] value-initialises a missing entry to 0.0, so a term seen for
     // the first time ends up with exactly `weight`.
283  weights[std::string(term)] += weight;
284  }
285 
/// Return the stored magnitude member (described by the generated index as the
/// pre-computed squared magnitude).  Nothing is recalculated by this call.
287  double get_magnitude() const { return magnitude; }
288 
/// Return the size of the termlist, i.e. the number of entries in the weights
/// map, converted to Xapian::termcount.
290  Xapian::termcount termlist_size() const { return weights.size(); }
291 
300  opt_intrusive_base::release();
301  return this;
302  }
303 
/// Start reference counting this object (const overload).
///
/// Delegates to opt_intrusive_base::release() and then returns this.
311  const PointType * release() const {
312  opt_intrusive_base::release();
313  return this;
314  }
315 };
316 
323 
324  public:
334  Point(const FreqSource& freqsource, const Document& document);
335 
/// Returns the document corresponding to this Point.
///
/// The document member is returned by value.
337  Document get_document() const { return document; }
338 };
339 
343  public:
/// Default constructor.
///
/// The body is empty, so no member is explicitly initialised here.
345  Centroid() { }
346 
353  explicit Centroid(const Point &point);
354 
361  void divide(double cluster_size);
362 
/// Clear the terms and corresponding values of the centroid.
///
/// The cached magnitude is reset as well, so that get_magnitude() cannot keep
/// reporting a value computed from weights which no longer exist.  0.0 is the
/// same value the magnitude member is given by its in-class initialiser.
364  void clear() { weights.clear(); magnitude = 0.0; }
365 };
366 
371  public:
372  class Internal;
375 
381  Cluster(const Cluster &other);
382 
388  Cluster& operator=(const Cluster &other);
389 
394  Cluster(Cluster && other);
395 
400  Cluster & operator=(Cluster && other);
401 
407  explicit Cluster(const Centroid &centroid);
408 
411 
414 
417 
423  void add_point(const Point &point);
424 
426  void clear();
427 
430 
433 
435  const Centroid& get_centroid() const;
436 
441  void set_centroid(const Centroid &centroid);
442 
447  void recalculate();
448 };
449 
453  public:
454  class Internal;
457 
463  ClusterSet(const ClusterSet &other);
464 
471 
477 
483 
486 
489 
494  void add_cluster(const Cluster &cluster);
495 
503  void add_to_cluster(const Point &point, unsigned int index);
504 
507 
510 
513 
516 };
517 
521  public:
523  virtual ~Similarity();
524 
530  virtual double similarity(const PointType &a, const PointType &b) const = 0;
531 
533  virtual std::string get_description() const = 0;
534 };
535 
539  public:
543  double similarity(const PointType& a, const PointType& b) const override;
544 
546  std::string get_description() const override;
547 };
548 
553  public:
555  virtual ~Clusterer();
556 
563  virtual ClusterSet cluster(const MSet &mset) = 0;
564 
566  virtual std::string get_description() const = 0;
567 
576  opt_intrusive_base::release();
577  return this;
578  }
579 
/// Start reference counting this object (const overload).
///
/// Delegates to opt_intrusive_base::release() and then returns this.
587  const Clusterer * release() const {
588  opt_intrusive_base::release();
589  return this;
590  }
591 };
592 
598  std::vector<Point> points;
599 
601  unsigned int k;
602 
604  unsigned int max_iters;
605 
608 
616  void initialise_clusters(ClusterSet &cset, Xapian::doccount num_of_points);
617 
626  void initialise_points(const MSet &source);
627 
628  public:
635  explicit KMeans(unsigned int k_, unsigned int max_iters_ = 0);
636 
642  ClusterSet cluster(const MSet &mset) override;
643 
/// Set the Xapian::Stopper object to be used for identifying stopwords.
///
/// @param stop  Stopper to assign to the stopper member.  Calling with no
///              argument assigns NULL.
651  void set_stopper(const Xapian::Stopper* stop = NULL) { stopper = stop; }
652 
654  std::string get_description() const override;
655 };
656 
664  unsigned int k;
665 
666  public:
671  explicit LCDClusterer(unsigned int k_);
672 
678  ClusterSet cluster(const MSet &mset) override;
679 
681  std::string get_description() const override;
682 };
683 }
684 #endif // XAPIAN_INCLUDED_CLUSTER_H
Compiler attribute macros.
Class to represent cluster centroids in the vector space.
Definition: cluster.h:342
Centroid()
Default constructor.
Definition: cluster.h:345
Centroid(const Point &point)
Constructor with Point argument.
void clear()
Clear the terms and corresponding values of the centroid.
Definition: cluster.h:364
void divide(double cluster_size)
Divide the weight of terms in the centroid by 'size' and recalculate the magnitude.
Class for storing the results returned by the Clusterer.
Definition: cluster.h:452
ClusterSet(ClusterSet &&other)
Move constructor.
Xapian::doccount size() const
Return the number of clusters.
ClusterSet & operator=(ClusterSet &&other)
Move assignment operator.
ClusterSet(const ClusterSet &other)
Copying is allowed.
ClusterSet & operator=(const ClusterSet &other)
Assignment is allowed.
void add_to_cluster(const Point &point, unsigned int index)
Add the point to the cluster at position 'index'.
void add_cluster(const Cluster &cluster)
Add a cluster to the ClusterSet.
ClusterSet()
Default constructor.
const Cluster & operator[](Xapian::doccount i) const
Return the cluster at index 'i'.
void recalculate_centroids()
Recalculate the centroid for all the clusters in the ClusterSet.
~ClusterSet()
Destructor.
void clear_clusters()
Clear all the clusters in the ClusterSet.
Class to represent a Cluster which contains Points and the Centroid of the Cluster.
Definition: cluster.h:370
Cluster()
Default constructor.
const Centroid & get_centroid() const
Return the current centroid of the cluster.
Cluster(const Centroid &centroid)
Constructor.
const Point & operator[](Xapian::doccount i) const
Return the point at the given index in the cluster.
DocumentSet get_documents() const
Return the documents that are contained within the cluster.
void recalculate()
Recalculate the centroid of the Cluster after each iteration of the KMeans algorithm by taking the me...
void clear()
Clear the cluster weights.
void set_centroid(const Centroid &centroid)
Set the centroid of the Cluster to 'centroid'.
~Cluster()
Destructor.
Cluster(const Cluster &other)
Copying is allowed.
Cluster & operator=(const Cluster &other)
Assignment is allowed.
Cluster(Cluster &&other)
Move constructor.
Cluster & operator=(Cluster &&other)
Move assignment operator.
Xapian::doccount size() const
Return size of the cluster.
void add_point(const Point &point)
Add a document to the Cluster.
Class representing an abstract class for a clusterer to be implemented.
Definition: cluster.h:552
virtual ~Clusterer()
Destructor.
virtual std::string get_description() const =0
Returns a string describing the clusterer being used.
virtual ClusterSet cluster(const MSet &mset)=0
Implement the required clustering algorithm in the subclass and return clustered output as ClusterSet.
Clusterer * release()
Start reference counting this object.
Definition: cluster.h:575
const Clusterer * release() const
Start reference counting this object.
Definition: cluster.h:587
Class for calculating the cosine distance between two documents.
Definition: cluster.h:538
double similarity(const PointType &a, const PointType &b) const override
Calculates and returns the cosine similarity using the formula cos(theta) = a.b/(|a|*|b|)
std::string get_description() const override
Return a string describing this object.
Class representing a set of documents in a cluster.
Definition: cluster.h:79
DocumentSet & operator=(const DocumentSet &other)
Assignment is allowed.
DocumentSet(DocumentSet &&other)
Move constructor.
void add_document(const Document &document)
Add a new Document to the DocumentSet.
DocumentSet(const DocumentSet &other)
Copying is allowed.
Xapian::doccount size() const
Return the size of the DocumentSet.
const Xapian::Document & operator[](Xapian::doccount i) const
Return the Document in the DocumentSet at index i.
DocumentSet()
Default constructor.
~DocumentSet()
Destructor.
DocumentSet & operator=(DocumentSet &&other)
Move assignment operator.
Class representing a document.
Definition: document.h:64
Base class for TermListGroup. Stores and provides terms that are contained in a document and their respective term frequencies.
Definition: cluster.h:136
virtual ~FreqSource()
Destructor.
FreqSource(const FreqSource &)=delete
Don't allow copying.
FreqSource * release()
Start reference counting this object.
Definition: cluster.h:166
virtual doccount get_termfreq(const std::string &tname) const =0
Return the term frequency of a particular term 'tname'.
virtual doccount get_doccount() const =0
Return the number of documents within the MSet.
const FreqSource * release() const
Start reference counting this object.
Definition: cluster.h:178
FreqSource()
Default constructor.
Definition: cluster.h:145
void operator=(const FreqSource &)=delete
Don't allow assignment.
Base class for objects managed by opt_intrusive_ptr.
Kmeans clusterer: This clusterer implements the K-Means clustering algorithm.
Definition: cluster.h:596
void set_stopper(const Xapian::Stopper *stop=NULL)
Set the Xapian::Stopper object to be used for identifying stopwords.
Definition: cluster.h:651
ClusterSet cluster(const MSet &mset) override
Implements the KMeans clustering algorithm.
KMeans(unsigned int k_, unsigned int max_iters_=0)
Constructor specifying number of clusters and maximum iterations.
unsigned int max_iters
Specifies the maximum number of iterations that KMeans will have.
Definition: cluster.h:604
Xapian::Internal::opt_intrusive_ptr< const Xapian::Stopper > stopper
Pointer to stopper object for identifying stopwords.
Definition: cluster.h:607
unsigned int k
Specifies that the clusterer needs to form 'k' clusters.
Definition: cluster.h:601
std::vector< Point > points
Contains the initialised points that are to be clustered.
Definition: cluster.h:598
void initialise_points(const MSet &source)
Initialise the Points to be fed into the Clusterer with the MSet object 'source'.
std::string get_description() const override
Return a string describing this object.
void initialise_clusters(ClusterSet &cset, Xapian::doccount num_of_points)
Initialise 'k' clusters by selecting 'k' centroids and assigning them to different clusters.
LCD clusterer: This clusterer implements the LCD clustering algorithm adapted from Modelling efficien...
Definition: cluster.h:662
LCDClusterer(unsigned int k_)
Constructor specifying number of clusters.
ClusterSet cluster(const MSet &mset) override
Implements the LCD clustering algorithm.
unsigned int k
Specifies that the clusterer needs to form 'k' clusters.
Definition: cluster.h:664
std::string get_description() const override
Return a string describing this object.
Class representing a list of search results.
Definition: mset.h:46
Abstract class representing a point in the VSM.
Definition: cluster.h:225
void set_weight(std::string_view term, double weight)
Set the weight 'weight' to the mapping of a term.
Definition: cluster.h:242
double get_magnitude() const
Return the pre-computed squared magnitude.
Definition: cluster.h:287
void add_weight(std::string_view term, double weight)
Add the weight 'weight' to the mapping of a term.
Definition: cluster.h:282
PointType * release()
Start reference counting this object.
Definition: cluster.h:299
std::unordered_map< std::string, double > weights
Implement a map to store the terms within a document and their pre-computed TF-IDF weights.
Definition: cluster.h:230
TermIterator termlist_begin() const
Return a TermIterator to the beginning of the termlist.
TermIterator termlist_end() const noexcept
Return a TermIterator to the end of the termlist.
Definition: cluster.h:254
double get_weight(std::string_view term) const
Return the TF-IDF weight associated with a certain term.
Definition: cluster.h:271
const PointType * release() const
Start reference counting this object.
Definition: cluster.h:311
Xapian::termcount termlist_size() const
Return the size of the termlist.
Definition: cluster.h:290
bool contains(std::string_view term) const
Validate whether a certain term exists in the termlist or not by performing a lookup operation in the...
Definition: cluster.h:263
PointType()
Default constructor.
Definition: cluster.h:248
Class to represent a document as a point in the Vector Space Model.
Definition: cluster.h:320
Point(const FreqSource &freqsource, const Document &document)
Constructor. Initialise the point with terms and corresponding TF-IDF weights.
Document get_document() const
Returns the document corresponding to this Point.
Definition: cluster.h:337
Document document
The document which is being represented by the Point.
Definition: cluster.h:322
Base class for calculating the similarity between documents.
Definition: cluster.h:520
virtual ~Similarity()
Destructor.
virtual std::string get_description() const =0
Returns a string describing the similarity metric being used.
virtual double similarity(const PointType &a, const PointType &b) const =0
Calculates the similarity between the two documents.
Stopper subclass which checks for both stemmed and unstemmed stopwords.
Definition: cluster.h:48
std::string get_description() const override
Return a string describing this object.
Xapian::Stem stemmer
Definition: cluster.h:74
stem_strategy
Stemming strategies.
Definition: cluster.h:51
stem_strategy stem_action
Definition: cluster.h:72
void add(std::string_view term)
Add a single stop word and its stemmed equivalent.
StemStopper(const Xapian::Stem &stemmer, stem_strategy strategy=STEM_SOME)
Constructor.
std::unordered_set< std::string > stop_words
Definition: cluster.h:73
bool operator()(const std::string &term) const override
Is term a stop-word?
Definition: cluster.h:64
Class representing a stemming algorithm.
Definition: stem.h:74
Abstract base class for stop-word decision functor.
Definition: queryparser.h:50
Class for iterating over a list of terms.
Definition: termiterator.h:41
A class for construction of termlists which store the terms for a document along with the number of d...
Definition: cluster.h:188
doccount num_of_documents
Number of documents added to the termlist.
Definition: cluster.h:195
std::unordered_map< std::string, doccount > termfreq
Map of the terms and its corresponding term frequencies.
Definition: cluster.h:192
doccount get_termfreq(const std::string &tname) const override
Return the number of documents that the term 'tname' exists in.
void add_document(const Document &document, const Stopper *stopper=NULL)
Add a single document and calculates its corresponding term frequencies.
TermListGroup(const MSet &docs, const Stopper *stopper=NULL)
Constructor.
doccount get_doccount() const override
Return the number of documents within the MSet.
string term
Class representing a list of search results.
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:82
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A counts of terms.
Definition: types.h:64
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:37
parsing a user query string to build a Xapian::Query object
static Xapian::Stem stemmer
Definition: stemtest.cc:42
typedefs for Xapian
Define XAPIAN_VISIBILITY_* macros.
#define XAPIAN_VISIBILITY_DEFAULT
Definition: visibility.h:28