//: C01:HTMLStripper.cpp
// From Thinking in C++, 2nd Edition
// Available at http://www.BruceEckel.com
// (c) Bruce Eckel 2000
// Copyright notice in Copyright.txt
// Filter to remove html tags and markers
#include "../require.h"
#include <fstream>
#include <iostream>
#include <string>
using namespace std;
// Returns a copy of s in which every occurrence of f has been replaced by r.
// Matches are found left to right and do not overlap; text inserted by a
// replacement is never searched again. An empty f leaves s unchanged.
std::string replaceAll(std::string s, std::string f, std::string r) {
  if(f.empty())
    return s; // An empty pattern matches at every position; nothing to do
  // size_type, not unsigned int: a narrower type truncates npos on platforms
  // where size_t is wider, so the "not found" test would never succeed.
  std::string::size_type found = s.find(f);
  while(found != std::string::npos) {
    s.replace(found, f.length(), r);
    // Resume after the inserted text so that an r containing f cannot
    // cause an endless loop.
    found = s.find(f, found + r.length());
  }
  return s;
}
string stripHTMLTags(string s) {
while(true) {
unsigned int left = s.find('<');
unsigned int right = s.find('>');
if(left==string::npos || right==string::npos)
break;
s = s.erase(left, right - left + 1);
}
s = replaceAll(s, "<", "<");
s = replaceAll(s, ">", ">");
s = replaceAll(s, "&", "&");
s = replaceAll(s, " ", " ");
// Etc...
return s;
}
// Reads the file named by the first command-line argument one line at a
// time and writes each line to standard output after passing it through
// stripHTMLTags. Because input is handled per line, a tag that spans
// several lines is not removed.
int main(int argc, char* argv[]) {
  // requireArgs and assure come from require.h; presumably they report an
  // error and stop when the argument count is wrong or the file could not
  // be opened -- confirm against that header.
  requireArgs(argc, 1,
    "usage: HTMLStripper InputFile");
  std::ifstream in(argv[1]);
  assure(in, argv[1]);
  // std::getline grows the string as needed. Reading into a fixed-size
  // char buffer sets failbit on an over-long line, which would end the
  // loop early and silently drop the rest of the file.
  std::string line;
  while(std::getline(in, line))
    std::cout << stripHTMLTags(line) << '\n';
} ///:~