libcrn  3.9.5
A document image processing library
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
CRNUnivariateGaussianMixture.cpp
Go to the documentation of this file.
1 /* Copyright 2008-2016 INSA-Lyon, CoReNum, ENS-Lyon
2  *
3  * This file is part of libcrn.
4  *
5  * libcrn is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU Lesser General Public License as published by
7  * the Free Software Foundation, either version 3 of the License, or
8  * (at your option) any later version.
9  *
10  * libcrn is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU Lesser General Public License for more details.
14  *
15  * You should have received a copy of the GNU Lesser General Public License
16  * along with libcrn. If not, see <http://www.gnu.org/licenses/>.
17  *
18  * file: CRNUnivariateGaussianMixture.cpp
19  * \author Jean DUONG, Yann LEYDIER
20  */
21 
23 #include <CRNException.h>
26 #include <CRNData/CRNDataFactory.h>
27 #include <CRNi18n.h>
28 
29 #include <math.h>
30 #include <stdio.h>
31 #include <iostream>
32 
33 using namespace crn;
34 
39 {
40  members.clear();
41 }
42 
52 {
53  if (!isValidMemberIndex(k))
54  {
55  throw ExceptionDomain(StringUTF8("UnivariateGaussianMixture::GetMember(size_t k): ") +
56  _("index out of bounds."));
57  }
58 
59  return members[k].first;
60 }
61 
72 {
73  if (!isValidMemberIndex(k))
74  {
75  throw ExceptionDomain(StringUTF8("UnivariateGaussianMixture::GetWeight(size_t k): ") +
76  _("index out of bounds."));
77  }
78 
79  return members[k].second;
80 }
81 
91 double UnivariateGaussianMixture::GetMean(size_t k) const
92 {
93  if (!isValidMemberIndex(k))
94  {
95  throw ExceptionDomain(StringUTF8("UnivariateGaussianMixture::GetMean(size_t k): ") +
96  _("index out of bounds."));
97  }
98 
99  return members[k].first.GetMean();
100 }
101 
112 {
113  if (!isValidMemberIndex(k))
114  {
115  throw ExceptionDomain(StringUTF8("UnivariateGaussianMixture::GetVariance(size_t k): ") +
116  _("index out of bounds."));
117  }
118 
119  return members[k].first.GetVariance();
120 }
121 
129 {
130  members.push_back(std::make_pair(pdf, w));
131 }
132 
143 {
144  if (!isValidMemberIndex(k))
145  {
146  throw ExceptionDomain(StringUTF8("UnivariateGaussianMixture::SetMember("
147  "UnivariateGaussianPDF PDF, double w, size_t k): ") +
148  _("index out of bounds."));
149  }
150  members[k] = std::make_pair(pdf, w);
151 }
152 
154 {
155  pair_comp_mean(bool r):reverse_flag(r){}
156 
157  inline bool operator()(const std::pair<UnivariateGaussianPDF, double> &e1, const std::pair<UnivariateGaussianPDF, double> &e2) const
158  {
159  if (reverse_flag)
160  return e1.first.GetMean() > e2.first.GetMean();
161  else
162  return e1.first.GetMean() < e2.first.GetMean();
163  }
164 
165  private:
166  bool reverse_flag;
167 };
168 
170 {
171  pair_comp_variance(bool r):reverse_flag(r){}
172 
173  inline bool operator()(const std::pair<UnivariateGaussianPDF, double> &e1, const std::pair<UnivariateGaussianPDF, double> &e2) const
174  {
175  if (reverse_flag)
176  return e1.first.GetVariance() > e2.first.GetVariance();
177  else
178  return e1.first.GetVariance() < e2.first.GetVariance();
179  }
180 
181  private:
182  bool reverse_flag;
183 };
184 
186 {
187  pair_comp_weight(bool r):reverse_flag(r){}
188 
189  inline bool operator()(const std::pair<UnivariateGaussianPDF, double> &e1, const std::pair<UnivariateGaussianPDF, double> &e2) const
190  {
191  if (reverse_flag)
192  return e1.second > e2.second;
193  else
194  return e1.second < e2.second;
195  }
196 
197 private:
198  bool reverse_flag;
199 };
200 
205 {
206  std::sort(members.begin(), members.end(), pair_comp_mean(reverse));
207 }
208 
213 {
214  std::sort(members.begin(), members.end(), pair_comp_variance(reverse));
215 }
216 
221 {
222  std::sort(members.begin(), members.end(), pair_comp_weight(reverse));
223 }
224 
232 double UnivariateGaussianMixture::ValueAt(const double x) const
233 {
234  double d = 0.0;
235 
236  for (auto & elem : members)
237  d += elem.second * elem.first.ValueAt(x);
238 
239  return d;
240 }
241 
252 double UnivariateGaussianMixture::ValueAt(const double x, size_t k, bool weight_flag) const
253 {
254  if (!isValidMemberIndex(k))
255  throw ExceptionDomain(StringUTF8("UnivariateGaussianMixture::ValueAt(const double x, size_t k): ") +
256  _("index out of bounds."));
257 
258  double val = members[k].first.ValueAt(x);
259 
260  if (weight_flag)
261  val *= members[k].second;
262 
263  return val;
264 }
265 
274 {
275  size_t nbPatterns = data.GetRows();
276  double e = 0.0;
277 
278  for (size_t k = 0; k < nbPatterns; ++k)
279  e += log(ValueAt(data.At(k, 0)));
280 
281  return e;
282 }
283 
291 double UnivariateGaussianMixture::MLLE(const std::vector<double> &data) const
292 {
293  size_t nbPatterns = data.size();
294  double e = 0.0;
295 
296  for (size_t k = 0; k < nbPatterns; ++k)
297  e += log(ValueAt(data[k]));
298 
299  return e;
300 }
301 
312 unsigned int UnivariateGaussianMixture::EM(const MatrixDouble& patterns, size_t nbSeeds, double epsilon, size_t maximalIterations)
313 {
314  unsigned int nbIterations = 0;
315  size_t nbMembers = size_t(nbSeeds);
316  size_t nbPatterns = patterns.GetRows();
317 
319  // Setup //
321 
322  members.clear();
323 
324  double min_value = patterns.GetMin();
325  double max_value = patterns.GetMax();
326  double delta = (max_value - min_value) / double(nbSeeds);
327  double seed = min_value + delta / 2.0;
328 
329  // Heuristic setup : gaussians are placed equidistant from Min to Max value
330  // variance set to catch at least one data value
331 
332  for (size_t k = 0; k < nbMembers; ++k)
333  {
334  // Warning : if data distribution is too sparse, variance should be adapted
335 
336  bool redo = true;
337  double var = delta;
338  double sigma = sqrt(var);
339  size_t counter = 0;
340 
341  while (redo)
342  {
343  double v = patterns[counter][0];
344 
345  redo = (fabs(seed - v) > sigma);
346  ++counter;
347 
348  if ((counter >= nbPatterns) && redo)
349  {
350  var += delta;
351  sigma = sqrt(var);
352  counter = 0;
353  }
354  }
355 
356  AddMember(UnivariateGaussianPDF(seed, var), 1.0);
357 
358  seed += delta;
359  }
360 
361  MatrixDouble Proba(nbPatterns, nbMembers, 0.0);
362 
363  bool Continue = true;
364  double likelihood = 0.0;
365 
367  // Iterations //
369 
370  while(Continue)
371  {
372  // Expectation
373 
374  for (size_t i = 0; i < nbPatterns; ++i)
375  {
376  double xi = patterns[i][0];
377  double Pi = 0.0;
378 
379  for (size_t k = 0; k < nbMembers; ++k)
380  {
381  double pik = this->ValueAt(xi, k);
382 
383  Proba[i][k] = pik;
384  Pi += pik;
385  }
386 
387  Proba.MultRow(i, 1.0 / Pi);
388  }
389 
390  // Maximization
391 
392  for (size_t k = 0; k < nbMembers; ++k)
393  {
394  UnivariateGaussianPDF Gk = members[k].first;
395  double CumulPk = 0.0;
396  double muk = Gk.GetMean();
397 
398  double mu = 0.0;
399  double v = 0.0;
400 
401  for (size_t i = 0; i < nbPatterns; ++i)
402  {
403  double pik = Proba[i][k];
404  double xi = patterns[i][0];
405 
406  CumulPk += pik;
407  mu += pik * xi;
408  v += pik * Sqr(xi - muk);
409  }
410 
411  // Numeric limit reached. Recomputation needed
412  if (std::isinf(mu) || std::isinf(v))
413  {
414  mu = 0.0;
415  v = 0.0;
416 
417  for (size_t i = 0; i < nbPatterns; ++i)
418  {
419  double pik = Proba[i][k];
420  double xi = patterns[i][0];
421 
422  mu += pik * xi / CumulPk;
423  v += pik * Sqr(xi - muk) / CumulPk;
424  }
425  }
426  else
427  {
428  mu /= CumulPk;
429  v /= CumulPk;
430  }
431 
432  members[k].second = double(CumulPk) / double(nbPatterns);
433  members[k].first = UnivariateGaussianPDF(mu, v);
434  }
435 
436  double NewLikelihood = MLLE(patterns);
437  double LikelihoodDiff = fabs(NewLikelihood - likelihood);
438 
439  likelihood = NewLikelihood;
440  nbIterations++;
441 
442  Continue = ((nbIterations < maximalIterations) && (LikelihoodDiff > epsilon));
443  }
444 
445  return nbIterations;
446 }
447 
458 unsigned int UnivariateGaussianMixture::EM(const std::vector<double> &patterns, size_t nbSeeds, double epsilon, size_t maximalIterations)
459 {
460  unsigned int nbIterations = 0;
461  size_t nbMembers = size_t(nbSeeds);
462  size_t nbPatterns = patterns.size();
463  auto spatterns = patterns;
464 
466  // Setup //
468 
469  members.clear();
470  std::sort(spatterns.begin(), spatterns.end());
471 
472  double min_value = spatterns.front();
473  double max_value = spatterns.back();
474  double delta = (max_value - min_value) / double(nbSeeds);
475  double seed = min_value + delta / 2.0;
476 
477  // Heuristic setup : gaussians are placed equidistant from Min to Max value
478  // variance set to catch at least one data value
479 
480  for (size_t k = 0; k < nbMembers; ++k)
481  {
482  // Warning : if data distribution is too sparse, variance should be adapted
483 
484  bool redo = true;
485  double var = delta;
486  double sigma = sqrt(var);
487  size_t counter = 0;
488 
489  while (redo)
490  {
491  double v = spatterns[counter];
492 
493  redo = (fabs(seed - v) > sigma);
494  ++counter;
495 
496  if ((counter >= nbPatterns) && redo)
497  {
498  var += delta;
499  sigma = sqrt(var);
500  counter = 0;
501  }
502  }
503 
504  AddMember(UnivariateGaussianPDF(seed, var), 1.0);
505  seed += delta;
506  }
507 
508  MatrixDouble Proba(nbPatterns, nbMembers, 0.0);
509 
510  bool Continue = true;
511  double likelihood = 0.0;
512 
514  // Iterations //
516 
517  while(Continue)
518  {
519  // Expectation
520 
521  for (size_t i = 0; i < nbPatterns; ++i)
522  {
523  double xi = spatterns[i];
524  double Pi = 0.0;
525 
526  for (size_t k = 0; k < nbMembers; ++k)
527  {
528  double pik = this->ValueAt(xi, k);
529 
530  Proba[i][k] = pik;
531  Pi += pik;
532  }
533 
534  Proba.MultRow(i, 1.0 / Pi);
535  }
536 
537  // Maximization
538 
539  for (size_t k = 0; k < nbMembers; ++k)
540  {
541  UnivariateGaussianPDF Gk = members[k].first;
542  double CumulPk = 0.0;
543  double muk = Gk.GetMean();
544 
545  double mu = 0.0;
546  double v = 0.0;
547 
548  for (size_t i = 0; i < nbPatterns; ++i)
549  {
550  double pik = Proba[i][k];
551  double xi = spatterns[i];
552 
553  CumulPk += pik;
554  mu += pik * xi;
555  v += pik * Sqr(xi - muk);
556  }
557 
558  if (std::isfinite(mu) && std::isfinite(v))
559  {
560  mu /= CumulPk;
561  v /= CumulPk;
562  }
563  else // Numeric limit reached. Recomputation needed
564  {
565  mu = 0.0;
566  v = 0.0;
567 
568  for (size_t i = 0; i < nbPatterns; ++i)
569  {
570  double pik = Proba[i][k];
571  double xi = spatterns[i];
572 
573  mu += pik * xi / CumulPk;
574  v += pik * Sqr(xi - muk) / CumulPk;
575  }
576  }
577 
578  members[k].second = double(CumulPk) / double(nbPatterns);
579  members[k].first = UnivariateGaussianPDF(mu, v);
580  }
581 
582  double NewLikelihood = MLLE(spatterns);
583  double LikelihoodDiff = fabs(NewLikelihood - likelihood);
584 
585  likelihood = NewLikelihood;
586  nbIterations++;
587 
588  Continue = ((nbIterations < maximalIterations) && (LikelihoodDiff > epsilon));
589  }
590 
591  return nbIterations;
592 }
593 
594 /*****************************************************************************/
603 std::vector<double> UnivariateGaussianMixture::MakeRandomSample(size_t n, size_t m, bool reseed) const
604 {
605  size_t nbPDF = GetNbMembers();
606 
607  auto patterns = std::vector<double>();
608 
609  std::vector<int> pop;
610  std::vector<double> cumul_weights;
611  std::vector<double> index_sample;
612 
613  // Create uniform sample to choose indexes
614  // This should become a function !!!
615 
616  if (reseed)
617  {
618  srand((unsigned int)time(nullptr));
619  }
620 
621  for (size_t k = 0; k < n; ++k)
622  {
623  index_sample.push_back(((double)(rand())) / ((double)(RAND_MAX)));
624  }
625 
626  // Cumulative weights from the mixture
627 
628  double mass = 0.0;
629 
630  for (size_t k = 0; k < nbPDF; k++)
631  {
632  mass += GetWeight(k);
633 
634  cumul_weights.push_back(mass);
635  }
636 
637  for (size_t k = 0; k < nbPDF; ++k)
638  {
639  cumul_weights[k] /= mass;
640  pop.push_back(0);
641  }
642 
643  // Build indexes to indicate which PDF of the mixture is used to generate each pattern
644 
645  for (size_t k = 0; k < n; k++)
646  {
647  double d = index_sample[k];
648  bool continue_flag = true;
649  size_t Id = 0;
650 
651  while (continue_flag)
652  {
653  if ((cumul_weights[Id] >= d) || (Id == nbPDF - 1))
654  {
655  continue_flag = false;
656  }
657  else
658  {
659  ++Id;
660  }
661  }
662 
663  ++pop[Id];
664  }
665 
666  // Create patterns
667 
668  for (size_t p = 0; p < nbPDF; ++p)
669  {
670  int subsample_pop = pop[p];
671 
672  auto subsample = GetMember(p).MakeRandomSample(subsample_pop, m, reseed);
673 
674  for (int r = 0; r < subsample_pop; ++r)
675  {
676  patterns.push_back(subsample[r]);
677  }
678  }
679 
680  return patterns;
681 }
682 
689 {
690  String s;
691 
692  for (size_t k = 0; k < members.size(); ++k)
693  {
694  s += ToString(k) + U"\n";
695  }
696 
697  return s;
698 }
699 
708 {
709  String s;
710 
711  if (isValidMemberIndex(k))
712  {
713  s += U"Member : ";
714  s += k;
715  s += U" [0..";
716  s += members.size() - 1;
717  s += U"]\nWeight = ";
718  s += members[k].second;
719  s += U"\nMean = ";
720  s += (members[k].first).GetMean();
721  s += U"\nVariance = ";
722  s += (members[k].first).GetVariance();
723  }
724 
725  return s;
726 }
727 
740 {
741  if (el.GetName() != "UnivariateGaussianMixture")
742  {
743  throw ExceptionInvalidArgument(StringUTF8("void UnivariateGaussianMixture::Deserialize(xml::Element &el): ") +
744  _("Wrong XML element."));
745  }
746 
747  std::vector<std::pair<UnivariateGaussianPDF, double> > newmembers;
748 
749  for (xml::Element sub_el = el.BeginElement(); sub_el != el.EndElement(); ++sub_el)
750  {
751  double w = sub_el.GetAttribute<double>("weight", false); // may throw
752 
753  UnivariateGaussianPDF pdf = UnivariateGaussianPDF(sub_el); // may throw
754 
755  newmembers.push_back(std::make_pair(pdf, w));
756  }
757 
758  members.swap(newmembers);
759 }
760 
771 {
772  xml::Element el(parent.PushBackElement("UnivariateGaussianMixture"));
773 
774  for (auto & elem : members)
775  {
776  xml::Element sub_el(elem.first.Serialize(el));
777  sub_el.SetAttribute("weight", elem.second);
778  }
779 
780  return el;
781 }
782 
784  CRN_DATA_FACTORY_REGISTER(U"UnivariateGaussianMixture", UnivariateGaussianMixture)
785  Cloner::Register<UnivariateGaussianMixture>();
786 CRN_END_CLASS_CONSTRUCTOR(UnivariateGaussianMixture)
787 
bool operator()(const std::pair< UnivariateGaussianPDF, double > &e1, const std::pair< UnivariateGaussianPDF, double > &e2) const
size_t GetRows() const noexcept
Returns the number of rows.
Definition: CRNMatrix.h:157
Univariate Gaussian distribution.
crn::StringUTF8 Id
Definition: CRNAltoUtils.h:31
XML element.
Definition: CRNXml.h:135
double GetWeight(size_t k) const
Returns the weight of a given density function.
StringUTF8 GetName() const
Gets the label of the element.
Definition: CRNXml.h:146
#define _(String)
Definition: CRNi18n.h:51
String ToString() const
Dumps a summary of the mixture to a string.
bool operator()(const std::pair< UnivariateGaussianPDF, double > &e1, const std::pair< UnivariateGaussianPDF, double > &e2) const
void AddMember(UnivariateGaussianPDF pdf, double Weight)
Adds a density function.
void SetMember(UnivariateGaussianPDF pdf, double w, size_t k)
Replaces a given density function and its weight.
std::vector< double > MakeRandomSample(size_t n=1, size_t m=100, bool reseed=true) const
Creates a data sample following the PDF's probability law.
UnivariateGaussianPDF GetMember(size_t k) const
Returns a given density function.
double ValueAt(const double x) const
Evaluates a pattern.
Element BeginElement()
Gets the first child element.
Definition: CRNXml.h:174
#define CRN_END_CLASS_CONSTRUCTOR(classname)
Defines a class constructor.
Definition: CRNObject.h:198
bool operator()(const std::pair< UnivariateGaussianPDF, double > &e1, const std::pair< UnivariateGaussianPDF, double > &e2) const
A UTF32 character string class.
Definition: CRNString.h:61
virtual ~UnivariateGaussianMixture() override
Destructor.
A generic domain error.
Definition: CRNException.h:83
T GetMin() const
Definition: CRNMatrix.h:432
size_t GetNbMembers() const noexcept
Returns the number of density functions.
unsigned int EM(const MatrixDouble &patterns, size_t nbSeeds=2, double epsilon=std::numeric_limits< double >::epsilon(), size_t maximalIterations=100)
Expectation Maximization.
void MultRow(size_t r, double v)
Scale one row from matrix.
Definition: CRNMatrix.h:320
#define CRN_DATA_FACTORY_REGISTER(elemname, classname)
Registers a class to the data factory.
constexpr SumType< T > Sqr(const T &v) noexcept(noexcept(v *v))
Returns the square of a value.
Definition: CRNMath.h:61
void SortMembersByWeights(bool reverse=false)
Sort members by weights.
double GetMean() const noexcept
Returns the mean of a given density function.
void SetAttribute(const StringUTF8 &name, const StringUTF8 &value)
Sets the value of an attribute.
Definition: CRNXml.cpp:595
double matrix class
const T & At(size_t pos) const noexcept
Definition: CRNMatrix.h:165
double GetVariance(size_t k) const
Returns the variance of a given density function.
T GetMax() const
Definition: CRNMatrix.h:438
A character string class.
Definition: CRNStringUTF8.h:49
std::vector< double > MakeRandomSample(size_t n=1, size_t m=100, bool reseed=true) const
Creates a data sample following the mixture's probability law.
void SortMembersByMeans(bool reverse=false)
Sort members by means.
xml::Element Serialize(xml::Element &parent) const
Element PushBackElement(const StringUTF8 &name)
Adds an element at the end of the children list.
Definition: CRNXml.cpp:355
void SortMembersByVariances(bool reverse=false)
Sort members by means.
Element EndElement()
Gets a null node.
Definition: CRNXml.h:176
double MLLE(const MatrixDouble &Data) const
Maximum log-likelihood estimation.
Invalid argument error (e.g.: nullptr pointer)
Definition: CRNException.h:107
double GetMean(size_t k) const
Returns the mean of a given density function.
#define CRN_BEGIN_CLASS_CONSTRUCTOR(classname)
Defines a class constructor.
Definition: CRNObject.h:185