//: C03:HTMLStripper.cpp {RunByHand} // From "Thinking in C++, Volume 2", by Bruce Eckel & Chuck Allison. // (c) 1995-2004 MindView, Inc. All Rights Reserved. // See source code use permissions stated in the file 'License.txt', // distributed with the code package available at www.MindView.net. //{L} ReplaceAll // Filter to remove html tags and markers. #include #include #include #include #include #include #include "ReplaceAll.h" #include "../require.h" using namespace std; string& stripHTMLTags(string& s) { static bool inTag = false; bool done = false; while(!done) { if(inTag) { // The previous line started an HTML tag // but didn't finish. Must search for '>'. size_t rightPos = s.find('>'); if(rightPos != string::npos) { inTag = false; s.erase(0, rightPos + 1); } else { done = true; s.erase(); } } else { // Look for start of tag: size_t leftPos = s.find('<'); if(leftPos != string::npos) { // See if tag close is in this line: size_t rightPos = s.find('>'); if(rightPos == string::npos) { inTag = done = true; s.erase(leftPos); } else s.erase(leftPos, rightPos - leftPos + 1); } else done = true; } } // Remove all special HTML characters replaceAll(s, "<", "<"); replaceAll(s, ">", ">"); replaceAll(s, "&", "&"); replaceAll(s, " ", " "); // Etc... return s; } int main(int argc, char* argv[]) { requireArgs(argc, 1, "usage: HTMLStripper InputFile"); ifstream in(argv[1]); assure(in, argv[1]); string s; while(getline(in, s)) if(!stripHTMLTags(s).empty()) cout << s << endl; } ///:~