//: C03:HTMLStripper.cpp {RunByHand}
// From "Thinking in C++, Volume 2", by Bruce Eckel & Chuck Allison.
// (c) 1995-2004 MindView, Inc. All Rights Reserved.
// See source code use permissions stated in the file 'License.txt',
// distributed with the code package available at www.MindView.net.
//{L} ReplaceAll
// Filter to remove html tags and markers.
#include
#include
#include
#include
#include
#include
#include "ReplaceAll.h"
#include "../require.h"
using namespace std;
string& stripHTMLTags(string& s) {
static bool inTag = false;
bool done = false;
while(!done) {
if(inTag) {
// The previous line started an HTML tag
// but didn't finish. Must search for '>'.
size_t rightPos = s.find('>');
if(rightPos != string::npos) {
inTag = false;
s.erase(0, rightPos + 1);
}
else {
done = true;
s.erase();
}
}
else {
// Look for start of tag:
size_t leftPos = s.find('<');
if(leftPos != string::npos) {
// See if tag close is in this line:
size_t rightPos = s.find('>');
if(rightPos == string::npos) {
inTag = done = true;
s.erase(leftPos);
}
else
s.erase(leftPos, rightPos - leftPos + 1);
}
else
done = true;
}
}
// Remove all special HTML characters
replaceAll(s, "<", "<");
replaceAll(s, ">", ">");
replaceAll(s, "&", "&");
replaceAll(s, " ", " ");
// Etc...
return s;
}
int main(int argc, char* argv[]) {
requireArgs(argc, 1,
"usage: HTMLStripper InputFile");
ifstream in(argv[1]);
assure(in, argv[1]);
string s;
while(getline(in, s))
if(!stripHTMLTags(s).empty())
cout << s << endl;
} ///:~