import org.apache.xerces.parsers.DOMParser;
import org.apache.xerces.xni.parser.XMLDocumentFilter;
import org.cyberneko.html.HTMLConfiguration;
import org.cyberneko.html.filters.*;
// Fetch the page at urlStr over HTTP and parse it into a DOM document using
// the NekoHTML configuration on top of the Xerces DOMParser.
// urlStr, cto, rto and userAgent are supplied by the enclosing scope.
URL url = new URL(urlStr);
HttpURLConnection hcon = (HttpURLConnection) url.openConnection();
hcon.setConnectTimeout(cto);
hcon.setReadTimeout(rto);
hcon.setUseCaches(false);
hcon.setRequestProperty("User-Agent", userAgent);
hcon.setRequestProperty("Accept-Charset", "utf-8");
hcon.setRequestProperty("Keep-Alive", "300");

InputStream instr = hcon.getInputStream();
try {
    // Register the Purifier filter in the NekoHTML filter chain before the
    // parser is constructed from this configuration.
    HTMLConfiguration config = new HTMLConfiguration();
    XMLDocumentFilter[] filters = { new Purifier() };
    config.setProperty("http://cyberneko.org/html/properties/filters", filters);

    DOMParser dp = new DOMParser(config);
    // Ask the parser not to include ignorable whitespace in the DOM.
    dp.setFeature("http://apache.org/xml/features/dom/include-ignorable-whitespace", false);
    dp.parse(new org.xml.sax.InputSource(instr));
    return dp.getDocument();
} finally {
    // Always release the response stream, including when parsing throws.
    // Only the stream is closed (no disconnect()) so that a persistent
    // connection may still be reused by later requests.
    instr.close();
}