1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
|
/***************************************************************************
* Copyright (C) 2008-2021 by Andrzej Rybczak *
* andrzej@rybczak.net *
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation; either version 2 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program; if not, write to the *
* Free Software Foundation, Inc., *
* 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. *
***************************************************************************/
#include <algorithm>
#include <boost/algorithm/string/replace.hpp>
#include "utility/html.h"
// Appends the UTF-8 encoding (1 to 4 bytes) of code_point to out.
// The caller guarantees code_point <= 0x10ffff.
static void appendUtf8(std::string &out, unsigned long code_point)
{
	if (code_point >= 0x10000)
	{
		out += static_cast<char>(0xf0 | ((code_point >> 18) & 0x07));
		out += static_cast<char>(0x80 | ((code_point >> 12) & 0x3f));
		out += static_cast<char>(0x80 | ((code_point >> 6) & 0x3f));
		out += static_cast<char>(0x80 | (code_point & 0x3f));
	}
	else if (code_point >= 0x800)
	{
		out += static_cast<char>(0xe0 | ((code_point >> 12) & 0x0f));
		out += static_cast<char>(0x80 | ((code_point >> 6) & 0x3f));
		out += static_cast<char>(0x80 | (code_point & 0x3f));
	}
	else if (code_point >= 0x80)
	{
		out += static_cast<char>(0xc0 | ((code_point >> 6) & 0x1f));
		out += static_cast<char>(0x80 | (code_point & 0x3f));
	}
	else
		out += static_cast<char>(code_point);
}

// Returns a copy of data in which every numeric character reference
// ("&#NNN;" decimal or "&#xHHH;" hexadecimal) is replaced with the UTF-8
// encoding of the code point it denotes. All other text is copied through
// unchanged.
//
// A reference is only decoded if everything between "&#" and the next ';'
// is a non-empty run of digits in the selected base, and the value is a
// code point in [0, 0x10ffff] outside the surrogate range. Anything else
// (e.g. "&#abc;", "&#;") is left in the output verbatim.
std::string unescapeHtmlUtf8(const std::string &data)
{
	const unsigned long max_code_point = 0x10ffff;

	std::string result;
	result.reserve(data.length());
	for (size_t i = 0; i < data.length(); ++i)
	{
		if (data[i] == '&' && i + 1 < data.length() && data[i+1] == '#')
		{
			const size_t end = data.find(';', i + 2);
			if (end != std::string::npos)
			{
				size_t k = i + 2;
				unsigned base = 10;
				if (k < end && (data[k] == 'x' || data[k] == 'X'))
				{
					base = 16;
					++k;
				}
				// At least one digit is required.
				bool valid = k < end;
				unsigned long n = 0;
				for (; valid && k < end; ++k)
				{
					const char c = data[k];
					unsigned digit = 0;
					if (c >= '0' && c <= '9')
						digit = c - '0';
					else if (base == 16 && c >= 'a' && c <= 'f')
						digit = c - 'a' + 10;
					else if (base == 16 && c >= 'A' && c <= 'F')
						digit = c - 'A' + 10;
					else
						valid = false;
					if (valid)
					{
						// n never exceeds max_code_point before this step, so
						// the multiplication cannot overflow.
						n = n * base + digit;
						if (n > max_code_point)
							valid = false;
					}
				}
				// Surrogates have no valid UTF-8 encoding.
				if (valid && n >= 0xd800 && n <= 0xdfff)
					valid = false;
				if (valid)
				{
					appendUtf8(result, n);
					// Resume after the terminating ';' (the loop adds one).
					i = end;
					continue;
				}
			}
		}
		result += data[i];
	}
	return result;
}
// Replaces, in place, a small fixed set of named HTML entities in s with
// the characters they denote: &amp; &gt; &lt; &nbsp; &quot; &ndash; &mdash;.
//
// The string is scanned once from left to right and replaced text is never
// rescanned, so an escaped ampersand followed by entity-like text is only
// unescaped one level ("&amp;lt;" becomes "&lt;", not "<"). Unknown
// entities and stray '&' characters are left untouched.
void unescapeHtmlEntities(std::string &s)
{
	struct Entity
	{
		const char *name;
		size_t length;
		const char *replacement;
	};
	// well, at least some of them.
	static const Entity entities[] = {
		{ "&amp;",   5, "&" },
		{ "&gt;",    4, ">" },
		{ "&lt;",    4, "<" },
		{ "&nbsp;",  6, " " },
		{ "&quot;",  6, "\"" },
		{ "&ndash;", 7, "\xe2\x80\x93" }, // U+2013 EN DASH, UTF-8 encoded
		{ "&mdash;", 7, "\xe2\x80\x94" }, // U+2014 EM DASH, UTF-8 encoded
	};

	size_t amp = s.find('&');
	if (amp == std::string::npos)
		return; // nothing to do, avoid the copy

	std::string result;
	result.reserve(s.length());
	size_t pos = 0;
	while (amp != std::string::npos)
	{
		// Copy the text preceding the ampersand unchanged.
		result.append(s, pos, amp - pos);

		const Entity *match = nullptr;
		for (const Entity &e : entities)
		{
			// compare() clamps the range to the end of s, so a truncated
			// entity at the end of the string simply fails to match.
			if (s.compare(amp, e.length, e.name) == 0)
			{
				match = &e;
				break;
			}
		}
		if (match != nullptr)
		{
			result += match->replacement;
			pos = amp + match->length;
		}
		else
		{
			result += '&';
			pos = amp + 1;
		}
		amp = s.find('&', pos);
	}
	result.append(s, pos, std::string::npos);
	s.swap(result);
}
// Removes HTML tags from s in place and then unescapes the named entities
// handled by unescapeHtmlEntities().
//
// Existing '\n' and '\r' characters are dropped first. Paragraph and line
// break tags ("<p>", "<p ...>", "</p>", "<br>", "<br/>", "<br ...>") are
// replaced with a single '\n'; every other "<...>" span is removed. Tag
// names are matched case-sensitively. If a '<' has no matching '>', it and
// everything after it are kept as they are.
void stripHtmlTags(std::string &s)
{
	// Erase newlines so they don't duplicate with HTML ones.
	s.erase(std::remove_if(s.begin(), s.end(), [](char c) {
		return c == '\n' || c == '\r';
	}), s.end());

	// Tells whether the tag occupying [pos, pos+len) in s should turn into
	// a newline.
	const auto is_newline_tag = [&s](size_t pos, size_t len) {
		return s.compare(pos, std::min<size_t>(3, len), "<p ") == 0
		    || s.compare(pos, len, "<p>") == 0
		    || s.compare(pos, len, "</p>") == 0
		    || s.compare(pos, len, "<br>") == 0
		    || s.compare(pos, len, "<br/>") == 0
		    || s.compare(pos, std::min<size_t>(4, len), "<br ") == 0;
	};

	// Build the result in one pass instead of repeatedly erasing from the
	// middle of s, which made the work grow quadratically with tag count.
	std::string stripped;
	stripped.reserve(s.length());
	size_t pos = 0;
	for (;;)
	{
		const size_t open = s.find('<', pos);
		const size_t close = open == std::string::npos
			? std::string::npos
			: s.find('>', open);
		if (close == std::string::npos)
		{
			// No further complete tag: keep the remainder verbatim,
			// including an unmatched '<' if there is one.
			stripped.append(s, pos, std::string::npos);
			break;
		}
		stripped.append(s, pos, open - pos);
		if (is_newline_tag(open, close + 1 - open))
			stripped += '\n';
		pos = close + 1;
	}
	s.swap(stripped);

	unescapeHtmlEntities(s);
}
|