libcrn  3.9.5
A document image processing library
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
CRNTextFile.cpp
Go to the documentation of this file.
1 /* Copyright 2006-2014 Yann LEYDIER, CoReNum, INSA-Lyon
2  *
3  * This file is part of libcrn.
4  *
5  * libcrn is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU Lesser General Public License as published by
7  * the Free Software Foundation, either version 3 of the License, or
8  * (at your option) any later version.
9  *
10  * libcrn is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU Lesser General Public License for more details.
14  *
15  * You should have received a copy of the GNU Lesser General Public License
16  * along with libcrn. If not, see <http://www.gnu.org/licenses/>.
17  *
18  * file: CRNTextFile.cpp
19  * \author Yann LEYDIER
20  */
21 
22 #include <CRNi18n.h>
23 #include <CRNIO/CRNTextFile.h>
24 #include <fstream>
25 #include <CRNData/CRNForeach.h>
26 #include <CRNException.h>
27 
28 using namespace crn;
29 
30 /*****************************************************************************/
36  original_encoding(enc)
37 {
38 }
39 
40 /*****************************************************************************/
48 TextFile::TextFile(const Path &fname, Encoding enc):
49  filename(fname),
50  original_encoding(enc)
51 {
52  std::ifstream in;
53  in.open(fname.CStr());
54  if (!in.is_open())
55  throw ExceptionIO(StringUTF8("TextFile::TextFile(const Path &fname, Encoding enc): ") + _("Cannot open file ") + StringUTF8(fname));
56  std::filebuf *pbuf = in.rdbuf();
57  size_t size = size_t(pbuf->pubseekoff(0, std::ios::end, std::ios::in));
58  pbuf->pubseekpos(0, std::ios::in);
59  if (enc == Encoding::UTF_32)
60  {
61  size_t wsize = size / sizeof(char32_t);
62  std::vector<char32_t> buffer(wsize + 1);
63  pbuf->sgetn((char*)&(buffer.front()), size);
64  buffer[wsize] = U'\0'; // ensure that the string as a terminal 0
65  text = &(buffer.front());
66  }
67  else
68  {
69  std::vector<char> buffer(size + 1);
70  pbuf->sgetn(&(buffer.front()), size);
71  buffer[size] = '\0'; // ensure that the string as a terminal 0
72  text = &(buffer.front());
73  }
74  in.close();
75 }
76 
77 /*****************************************************************************/
85 void TextFile::Save(const Path &fname)
86 {
87  std::ofstream out;
88  out.open(fname.CStr());
89  if (!out.is_open())
90  {
91  throw ExceptionIO(StringUTF8("void TextFile::Save(const Path &fname): ") + _("cannot open file ") + StringUTF8(fname));
92  }
93  if (original_encoding == Encoding::UTF_32)
94  out.write((char*)text.CWStr(), text.Size() * sizeof(char32_t));
95  else
96  out << text.CStr();
97  out.close();
98  filename = fname;
99 }
100 
101 /*****************************************************************************/
110 void TextFile::Save(const Path &fname, Encoding enc)
111 {
112  std::ofstream out;
113  out.open(fname.CStr());
114  if (!out.is_open())
115  {
116  throw ExceptionIO(StringUTF8("void TextFile::Save(const Path &fname, Encoding enc): ") + _("cannot open file ") + StringUTF8(fname));
117  }
118  original_encoding = enc;
119  if (enc == Encoding::UTF_32)
120  out.write((char*)text.CWStr(), text.Size() * sizeof(char32_t));
121  else
122  out << text.CStr();
123  out.close();
124  filename = fname;
125 }
126 
127 /*****************************************************************************/
134 {
135  std::ofstream out;
136  out.open(filename.CStr());
137  if (!out.is_open())
138  {
139  throw ExceptionIO(StringUTF8("void TextFile::Save(): ") + _("cannot open file ") + StringUTF8(filename));
140  }
141  if (original_encoding == Encoding::UTF_32)
142  out.write((char*)text.CWStr(), text.Size() * sizeof(char32_t));
143  else
144  out << text.CStr();
145  out.close();
146 }
147 
148 
149 /*****************************************************************************/
156 std::set<String> TextFile::ExtractWords(bool case_sensitive) const
157 {
158  std::set<String> words;
159 
160  std::vector<String> stxt(SplitText());
161  for (String &s : stxt)
162  {
163  if (!case_sensitive)
164  {
165  s.ToLower();
166  }
167  words.insert(s);
168  }
169  return words;
170 }
171 
172 /*****************************************************************************/
181 std::map<int, String> TextFile::ExtractWordsByFrequency(bool case_sensitive) const
182 {
183  std::map<String, int> occs;
184 
185  std::vector<String> stxt(SplitText());
186  for (String &s : stxt)
187  {
188  if (!case_sensitive)
189  {
190  s.ToLower();
191  }
192  occs[s] += 1;
193  }
194  std::map<int, String> words;
195  for (std::pair<const String, int>&p : occs)
196  {
197  words[p.second] = p.first;
198  }
199  return words;
200 }
201 
size_t Size() const noexcept
Returns the length of the string.
Definition: CRNString.h:160
#define _(String)
Definition: CRNi18n.h:51
const char * CStr() const
Conversion to UTF8 cstring.
Definition: CRNString.cpp:167
Encoding
Allowed text encoding.
Definition: CRNTextFile.h:47
void Save()
Overwrites the loaded file.
A UTF32 character string class.
Definition: CRNString.h:61
TextFile(Encoding enc=Encoding::UTF_8)
Blank constructor.
Definition: CRNTextFile.cpp:35
const char * CStr() const noexcept
Conversion to UTF8 cstring.
A convenience class for file paths.
Definition: CRNPath.h:39
std::vector< String > SplitText() const
Extract words.
Definition: CRNTextFile.h:72
const char32_t * CWStr() const noexcept
Conversion to wide cstring.
Definition: CRNString.h:128
std::set< String > ExtractWords(bool case_sensitive=false) const
Extract unique words.
A character string class.
Definition: CRNStringUTF8.h:49
I/O error.
Definition: CRNException.h:179
std::map< int, String > ExtractWordsByFrequency(bool case_sensitive=false) const
Extract unique words sorted by frequency.