<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US" xml:lang="en-US">
<head>
  <title>akregator/src/librss: feeddetector.cpp Source File (akregator/src/librss)</title>
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
  <meta http-equiv="Content-Style-Type" content="text/css" />
  <meta http-equiv="pics-label" content='(pics-1.1 "http://www.icra.org/ratingsv02.html" comment "ICRAonline DE v2.0" l gen true for "http://www.kde.org" r (nz 1 vz 1 lz 1 oz 1 cb 1) "http://www.rsac.org/ratingsv01.html" l gen true for "http://www.kde.org" r (n 0 s 0 v 0 l 0))' />
  <meta name="trademark" content="KDE e.V." />
  <meta name="description" content="K Desktop Environment Homepage, KDE.org" />
  <meta name="MSSmartTagsPreventParsing" content="true" />
  <meta name="robots" content="all" />
  <link rel="shortcut icon" href="../../../../favicon.ico" />
  <link rel="stylesheet" media="screen" type="text/css" title="APIDOX" href="doxygen.css" />
  <style type="text/css">
  <!--
    hr { display: none; }
    #content h2 { margin-left: 0px; }
    table.mdTable { background-color: #f8f8f8; border: .2em solid #d7d7d7; }
    td.mdRow { padding: 8px 20px; }
    td.md { font-weight: bold; }
    td.mdname1 { font-weight: bold; color: #602020; }
    td.mdname { font-weight: bold; color: #602020; }
  -->
  </style>
</head>
<body>
  <!-- Page header: skip link, home logo, PIM logo and site title. -->
  <div id="nav_header_top" align="right">
    <a href="#content" class="doNotDisplay" accesskey="2">Skip to main content ::</a>
    <a href="../../../.."><img id="nav_header_logo" alt="Home" align="left" src="../../../../kde_gear_64.png" border="0" /></a>
    <span class="doNotDisplay">::</span>
    <img id="nav_header_logo_right" alt="" align="right" src="../../../../pimlogo.png" border="0" />
    <div id="nav_header_title" align="left">KDE PIM API Reference</div>
  </div>

  <!-- Breadcrumb trail down to the librss module. -->
  <div id="nav_header_bottom" align="right">
    <span class="doNotDisplay">:: <a href="#navigation" accesskey="5">Skip to Link Menu</a><br/></span>
    <div id="nav_header_bottom_left" style="text-align: left;">
      / <a href="../../../../">API Reference</a>
      / <a href="../../../html/index.html">akregator</a>
      / <a href="../../html/index.html">src</a>
      / <a href=".">librss</a>
    </div>
  </div>

  <!-- Two-column layout: the content column comes first in the markup and spans both rows; the link menu sits in the second row. -->
  <table id="main" border="0" cellpadding="0" cellspacing="0" width="100%">
    <tr>
      <td valign="top" class="menuheader" height="0"></td>
      <td id="contentcolumn" valign="top" rowspan="2" >
        <div id="content" style="padding-top: 0px;"><div style="width:100%; margin: 0px; padding: 0px;">
          <h2><a name="content"></a>akregator/src/librss</h2>
          <!-- Generated by Doxygen 1.5.5 -->
          <h1>feeddetector.cpp</h1>
          <!-- Source listing: one numbered anchor (l00001 to l00179) per line of feeddetector.cpp.
               The characters "<", ">" and "&" of the C++ text are written as entities so that
               they are displayed literally instead of being parsed as markup. -->
<div class="fragment"><pre class="fragment"><a name="l00001"></a>00001 <span class="comment">/*</span>
<a name="l00002"></a>00002 <span class="comment"> This file is part of Akregator.</span>
<a name="l00003"></a>00003 <span class="comment"></span>
<a name="l00004"></a>00004 <span class="comment"> Copyright (C) 2004 Teemu Rytilahti &lt;tpr@d5k.net&gt;</span>
<a name="l00005"></a>00005 <span class="comment"></span>
<a name="l00006"></a>00006 <span class="comment"> This program is free software; you can redistribute it and/or modify</span>
<a name="l00007"></a>00007 <span class="comment"> it under the terms of the GNU General Public License as published by</span>
<a name="l00008"></a>00008 <span class="comment"> the Free Software Foundation; either version 2 of the License, or</span>
<a name="l00009"></a>00009 <span class="comment"> (at your option) any later version.</span>
<a name="l00010"></a>00010 <span class="comment"></span>
<a name="l00011"></a>00011 <span class="comment"> This program is distributed in the hope that it will be useful,</span>
<a name="l00012"></a>00012 <span class="comment"> but WITHOUT ANY WARRANTY; without even the implied warranty of</span>
<a name="l00013"></a>00013 <span class="comment"> MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the</span>
<a name="l00014"></a>00014 <span class="comment"> GNU General Public License for more details.</span>
<a name="l00015"></a>00015 <span class="comment"></span>
<a name="l00016"></a>00016 <span class="comment"> You should have received a copy of the GNU General Public License</span>
<a name="l00017"></a>00017 <span class="comment"> along with this program; if not, write to the Free Software</span>
<a name="l00018"></a>00018 <span class="comment"> Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.</span>
<a name="l00019"></a>00019 <span class="comment"></span>
<a name="l00020"></a>00020 <span class="comment"> As a special exception, permission is given to link this program</span>
<a name="l00021"></a>00021 <span class="comment"> with any edition of Qt, and distribute the resulting executable,</span>
<a name="l00022"></a>00022 <span class="comment"> without including the source code for Qt in the source distribution.</span>
<a name="l00023"></a>00023 <span class="comment">*/</span>
<a name="l00024"></a>00024
<a name="l00025"></a>00025 <span class="preprocessor">#include &lt;qregexp.h&gt;</span>
<a name="l00026"></a>00026 <span class="preprocessor">#include &lt;qstring.h&gt;</span>
<a name="l00027"></a>00027 <span class="preprocessor">#include &lt;qstringlist.h&gt;</span>
<a name="l00028"></a>00028 <span class="preprocessor">#include &lt;qvaluelist.h&gt;</span>
<a name="l00029"></a>00029 <span class="preprocessor">#include &lt;kcharsets.h&gt;</span>
<a name="l00030"></a>00030 <span class="preprocessor">#include &lt;kurl.h&gt;</span>
<a name="l00031"></a>00031
<a name="l00032"></a>00032 <span class="preprocessor">#include "feeddetector.h"</span>
<a name="l00033"></a>00033
<a name="l00034"></a>00034
<a name="l00035"></a>00035 <span class="keyword">using namespace </span>RSS;
<a name="l00036"></a>00036
<a name="l00037"></a><a class="code" href="classRSS_1_1FeedDetector.html#24417ee5c9a1fb9abf2ee3dc483f28ac">00037</a> FeedDetectorEntryList FeedDetector::extractFromLinkTags(<span class="keyword">const</span> QString&amp; s)
<a name="l00038"></a>00038 {
<a name="l00039"></a>00039     <span class="comment">//reduce all sequences of spaces, newlines etc. to one space:</span>
<a name="l00040"></a>00040     QString str = s.simplifyWhiteSpace();
<a name="l00041"></a>00041
<a name="l00042"></a>00042     <span class="comment">// extracts &lt;link&gt; tags</span>
<a name="l00043"></a>00043     QRegExp reLinkTag(<span class="stringliteral">"&lt;[\\s]?LINK[^&gt;]*REL[\\s]?=[\\s]?\\\"[\\s]?(ALTERNATE|SERVICE\\.FEED)[\\s]?\\\"[^&gt;]*&gt;"</span>, <span class="keyword">false</span>);
<a name="l00044"></a>00044
<a name="l00045"></a>00045     <span class="comment">// extracts the URL (href="url")</span>
<a name="l00046"></a>00046     QRegExp reHref(<span class="stringliteral">"HREF[\\s]?=[\\s]?\\\"([^\\\"]*)\\\""</span>, <span class="keyword">false</span>);
<a name="l00047"></a>00047     <span class="comment">// extracts type attribute</span>
<a name="l00048"></a>00048     QRegExp reType(<span class="stringliteral">"TYPE[\\s]?=[\\s]?\\\"([^\\\"]*)\\\""</span>, <span class="keyword">false</span>);
<a name="l00049"></a>00049     <span class="comment">// extracts the title (title="title")</span>
<a name="l00050"></a>00050     QRegExp reTitle(<span class="stringliteral">"TITLE[\\s]?=[\\s]?\\\"([^\\\"]*)\\\""</span>, <span class="keyword">false</span>);
<a name="l00051"></a>00051
<a name="l00052"></a>00052     <span class="keywordtype">int</span> pos = 0;
<a name="l00053"></a>00053     <span class="keywordtype">int</span> matchpos = 0;
<a name="l00054"></a>00054
<a name="l00055"></a>00055     <span class="comment">// get all &lt;link&gt; tags</span>
<a name="l00056"></a>00056     QStringList linkTags;
<a name="l00057"></a>00057     <span class="comment">//int strlength = str.length();</span>
<a name="l00058"></a>00058     <span class="keywordflow">while</span> ( matchpos != -1 )
<a name="l00059"></a>00059     {
<a name="l00060"></a>00060         matchpos = reLinkTag.search(str, pos);
<a name="l00061"></a>00061         <span class="keywordflow">if</span> (matchpos != -1)
<a name="l00062"></a>00062         {
<a name="l00063"></a>00063             linkTags.append( str.mid(matchpos, reLinkTag.matchedLength()) );
<a name="l00064"></a>00064             pos = matchpos + reLinkTag.matchedLength();
<a name="l00065"></a>00065         }
<a name="l00066"></a>00066     }
<a name="l00067"></a>00067
<a name="l00068"></a>00068     FeedDetectorEntryList list;
<a name="l00069"></a>00069
<a name="l00070"></a>00070     <span class="keywordflow">for</span> ( QStringList::Iterator it = linkTags.begin(); it != linkTags.end(); ++it )
<a name="l00071"></a>00071     {
<a name="l00072"></a>00072         QString type;
<a name="l00073"></a>00073         <span class="keywordtype">int</span> pos = reType.search(*it, 0);
<a name="l00074"></a>00074         <span class="keywordflow">if</span> (pos != -1)
<a name="l00075"></a>00075             type = reType.cap(1).lower();
<a name="l00076"></a>00076
<a name="l00077"></a>00077         <span class="comment">// we accept only type attributes indicating a feed</span>
<a name="l00078"></a>00078         <span class="keywordflow">if</span> ( type != <span class="stringliteral">"application/rss+xml"</span> &amp;&amp; type != <span class="stringliteral">"application/rdf+xml"</span>
<a name="l00079"></a>00079             &amp;&amp; type != <span class="stringliteral">"application/atom+xml"</span> &amp;&amp; type != <span class="stringliteral">"text/xml"</span> )
<a name="l00080"></a>00080             <span class="keywordflow">continue</span>;
<a name="l00081"></a>00081
<a name="l00082"></a>00082         QString title;
<a name="l00083"></a>00083         pos = reTitle.search(*it, 0);
<a name="l00084"></a>00084         <span class="keywordflow">if</span> (pos != -1)
<a name="l00085"></a>00085             title = reTitle.cap(1);
<a name="l00086"></a>00086
<a name="l00087"></a>00087         title = KCharsets::resolveEntities(title);
<a name="l00088"></a>00088
<a name="l00089"></a>00089         QString url;
<a name="l00090"></a>00090         pos = reHref.search(*it, 0);
<a name="l00091"></a>00091         <span class="keywordflow">if</span> (pos != -1)
<a name="l00092"></a>00092             url = reHref.cap(1);
<a name="l00093"></a>00093
<a name="l00094"></a>00094         url = KCharsets::resolveEntities(url);
<a name="l00095"></a>00095
<a name="l00096"></a>00096         <span class="comment">// if feed has no title, use the url as preliminary title (until feed is parsed)</span>
<a name="l00097"></a>00097         <span class="keywordflow">if</span> ( title.isEmpty() )
<a name="l00098"></a>00098             title = url;
<a name="l00099"></a>00099
<a name="l00100"></a>00100         <span class="keywordflow">if</span> ( !url.isEmpty() )
<a name="l00101"></a>00101             list.append(FeedDetectorEntry(url, title) );
<a name="l00102"></a>00102     }
<a name="l00103"></a>00103
<a name="l00104"></a>00104
<a name="l00105"></a>00105     <span class="keywordflow">return</span> list;
<a name="l00106"></a>00106 }
<a name="l00107"></a>00107
<a name="l00108"></a><a class="code" href="classRSS_1_1FeedDetector.html#66bc93d78f93d0b6f24947ea5441bb7b">00108</a> QStringList <a class="code" href="classRSS_1_1FeedDetector.html#66bc93d78f93d0b6f24947ea5441bb7b" title="searches an HTML page for slightly feed-like looking links and catches everything...">FeedDetector::extractBruteForce</a>(<span class="keyword">const</span> QString&amp; s)
<a name="l00109"></a>00109 {
<a name="l00110"></a>00110     QString str = s.simplifyWhiteSpace();
<a name="l00111"></a>00111
<a name="l00112"></a>00112     QRegExp reAhrefTag(<span class="stringliteral">"&lt;[\\s]?A[^&gt;]?HREF=[\\s]?\\\"[^\\\"]*\\\"[^&gt;]*&gt;"</span>, <span class="keyword">false</span>);
<a name="l00113"></a>00113
<a name="l00114"></a>00114     <span class="comment">// extracts the URL (href="url")</span>
<a name="l00115"></a>00115     QRegExp reHref(<span class="stringliteral">"HREF[\\s]?=[\\s]?\\\"([^\\\"]*)\\\""</span>, <span class="keyword">false</span>);
<a name="l00116"></a>00116
<a name="l00117"></a>00117     QRegExp rssrdfxml(<span class="stringliteral">".*(RSS|RDF|XML)"</span>, <span class="keyword">false</span>);
<a name="l00118"></a>00118
<a name="l00119"></a>00119     <span class="keywordtype">int</span> pos = 0;
<a name="l00120"></a>00120     <span class="keywordtype">int</span> matchpos = 0;
<a name="l00121"></a>00121
<a name="l00122"></a>00122     <span class="comment">// get all &lt;a href&gt; tags and capture url</span>
<a name="l00123"></a>00123     QStringList list;
<a name="l00124"></a>00124     <span class="comment">//int strlength = str.length();</span>
<a name="l00125"></a>00125     <span class="keywordflow">while</span> ( matchpos != -1 )
<a name="l00126"></a>00126     {
<a name="l00127"></a>00127         matchpos = reAhrefTag.search(str, pos);
<a name="l00128"></a>00128         <span class="keywordflow">if</span> ( matchpos != -1 )
<a name="l00129"></a>00129         {
<a name="l00130"></a>00130             QString ahref = str.mid(matchpos, reAhrefTag.matchedLength());
<a name="l00131"></a>00131             <span class="keywordtype">int</span> hrefpos = reHref.search(ahref, 0);
<a name="l00132"></a>00132             <span class="keywordflow">if</span> ( hrefpos != -1 )
<a name="l00133"></a>00133             {
<a name="l00134"></a>00134                 QString url = reHref.cap(1);
<a name="l00135"></a>00135
<a name="l00136"></a>00136                 url = KCharsets::resolveEntities(url);
<a name="l00137"></a>00137
<a name="l00138"></a>00138                 <span class="keywordflow">if</span> ( rssrdfxml.exactMatch(url) )
<a name="l00139"></a>00139                     list.append(url);
<a name="l00140"></a>00140             }
<a name="l00141"></a>00141
<a name="l00142"></a>00142             pos = matchpos + reAhrefTag.matchedLength();
<a name="l00143"></a>00143         }
<a name="l00144"></a>00144     }
<a name="l00145"></a>00145
<a name="l00146"></a>00146     <span class="keywordflow">return</span> list;
<a name="l00147"></a>00147 }
<a name="l00148"></a>00148
<a name="l00149"></a>00149 QString FeedDetector::fixRelativeURL(<span class="keyword">const</span> QString &amp;s, <span class="keyword">const</span> KURL &amp;baseurl)
<a name="l00150"></a>00150 {
<a name="l00151"></a>00151     QString s2=s;
<a name="l00152"></a>00152     KURL u;
<a name="l00153"></a>00153     <span class="keywordflow">if</span> (KURL::isRelativeURL(s2))
<a name="l00154"></a>00154     {
<a name="l00155"></a>00155         <span class="keywordflow">if</span> (s2.startsWith(<span class="stringliteral">"//"</span>))
<a name="l00156"></a>00156         {
<a name="l00157"></a>00157             s2=s2.prepend(baseurl.protocol()+<span class="stringliteral">":"</span>);
<a name="l00158"></a>00158             u=s2;
<a name="l00159"></a>00159         }
<a name="l00160"></a>00160         <span class="keywordflow">else</span> <span class="keywordflow">if</span> (s2.startsWith(<span class="stringliteral">"/"</span>))
<a name="l00161"></a>00161         {
<a name="l00162"></a>00162             KURL b2(baseurl);
<a name="l00163"></a>00163             b2.setPath(QString()); <span class="comment">// delete path and query, so that only protocol://host remains</span>
<a name="l00164"></a>00164             b2.setQuery(QString());
<a name="l00165"></a>00165             u = KURL(b2, s2.remove(0,1)); <span class="comment">// remove leading "/" </span>
<a name="l00166"></a>00166         }
<a name="l00167"></a>00167         <span class="keywordflow">else</span>
<a name="l00168"></a>00168         {
<a name="l00169"></a>00169             u = KURL(baseurl, s2);
<a name="l00170"></a>00170         }
<a name="l00171"></a>00171     }
<a name="l00172"></a>00172     <span class="keywordflow">else</span>
<a name="l00173"></a>00173         u=s2;
<a name="l00174"></a>00174
<a name="l00175"></a>00175     u.cleanPath();
<a name="l00176"></a>00176     <span class="comment">//kdDebug() &lt;&lt; "AKREGATOR_PLUGIN_FIXURL: " &lt;&lt; "url=" &lt;&lt; s &lt;&lt; " baseurl=" &lt;&lt; baseurl.url() &lt;&lt; " fixed=" &lt;&lt; u.url() &lt;&lt; </span>
<a name="l00177"></a>00177     <span class="comment">//endl;</span>
<a name="l00178"></a>00178     <span class="keywordflow">return</span> u.url();
<a name="l00179"></a>00179 }
</pre></div>
        </div></div>
      </td>
    </tr>
    <tr>
      <!-- Link menu: module-local Doxygen indexes, then the list of PIM API modules. -->
      <td valign="top" id="leftmenu" width="25%">
        <a name="navigation"></a>
        <div class="menu_box"><h2>akregator/src/librss</h2>
          <div class="nav_list">
            <ul>
              <li><a href="index.html">Main Page</a></li>
              <li><a href="hierarchy.html">Class Hierarchy</a></li>
              <li><a href="classes.html">Alphabetical List</a></li>
              <li><a href="annotated.html">Class List</a></li>
              <li><a href="files.html">File List</a></li>
              <li><a href="functions.html">Class Members</a></li>
            </ul>
            <!--
            <h2>Class Picker</h2>
            <div style="text-align: center;">
            <form name="guideform">
            <select name="guidelinks" style="width:100%;" onChange="window.location=document.guideform.guidelinks.options[document.guideform.guidelinks.selectedIndex].value">
            <option value="annotated.html">- Choose -</option>
            <option value="classRSS_1_1Article.html">rss::article</option>,
            <option value="classRSS_1_1DataRetriever.html">rss::dataretriever</option>,
            <option value="classRSS_1_1Document.html">rss::document</option>,
            <option value="classRSS_1_1FeedDetector.html">rss::feeddetector</option>,
            <option value="classRSS_1_1FileRetriever.html">rss::fileretriever</option>,
            <option value="classRSS_1_1Image.html">rss::image</option>,
            <option value="classRSS_1_1Loader.html">rss::loader</option>,
            <option value="classRSS_1_1OutputRetriever.html">rss::outputretriever</option>,
            <option value="classRSS_1_1TextInput.html">rss::textinput</option>,
            </select>
            </form>
            </div>
            -->
          </div>
        </div>
        <div class="menu_box"><h2>API Dox</h2>
          <div class="nav_list">
            <ul>
              <li><a href="../../../../akregator/html/index.html">akregator</a></li>
              <li> <a href="../../../../akregator/src/html/index.html">src</a></li>
              <li> <a href="../../../../akregator/src/librss/html/index.html">librss</a></li>
              <li><a href="../../../../certmanager/html/index.html">certmanager</a></li>
              <li> <a href="../../../../certmanager/lib/html/index.html">lib</a></li>
              <li><a href="../../../../kaddressbook/html/index.html">kaddressbook</a></li>
              <li><a href="../../../../kalarm/html/index.html">kalarm</a></li>
              <li> <a href="../../../../kalarm/lib/html/index.html">lib</a></li>
              <li><a href="../../../../kandy/html/index.html">kandy</a></li>
              <li><a href="../../../../karm/html/index.html">karm</a></li>
              <li><a href="../../../../kdgantt/html/index.html">kdgantt</a></li>
              <li><a href="../../../../kgantt/html/index.html">kgantt</a></li>
              <li><a href="../../../../kioslaves/html/index.html">kioslaves</a></li>
              <li> <a href="../../../../kioslaves/imap4/html/index.html">imap4</a></li>
              <li> <a href="../../../../kioslaves/mbox/html/index.html">mbox</a></li>
              <li><a href="../../../../kitchensync/html/index.html">kitchensync</a></li>
              <li><a href="../../../../kmail/html/index.html">kmail</a></li>
              <li><a href="../../../../knotes/html/index.html">knotes</a></li>
              <li><a href="../../../../konsolekalendar/html/index.html">konsolekalendar</a></li>
              <li><a href="../../../../kontact/html/index.html">kontact</a></li>
              <li><a href="../../../../korganizer/html/index.html">korganizer</a></li>
              <li><a href="../../../../kpilot/html/index.html">kpilot</a></li>
              <li> <a href="../../../../kpilot/kpilot/html/index.html">kpilot</a></li>
              <li> <a href="../../../../kpilot/lib/html/index.html">lib</a></li>
              <li><a href="../../../../libemailfunctions/html/index.html">libemailfunctions</a></li>
              <li><a href="../../../../libkcal/html/index.html">libkcal</a></li>
              <li><a href="../../../../libkdepim/html/index.html">libkdepim</a></li>
              <li><a href="../../../../libkholidays/html/index.html">libkholidays</a></li>
              <li><a href="../../../../libkmime/html/index.html">libkmime</a></li>
              <li><a href="../../../../libkpgp/html/index.html">libkpgp</a></li>
              <li><a href="../../../../libkpimidentities/html/index.html">libkpimidentities</a></li>
            </ul>
          </div>
        </div>
      </td>
    </tr>
  </table>

  <!-- Links bound to access keys 8, 9 and 0. -->
  <span class="doNotDisplay">
    <a href="http://www.kde.org/" accesskey="8">KDE Home</a> |
    <a href="http://accessibility.kde.org/" accesskey="9">KDE Accessibility Home</a> |
    <a href="http://www.kde.org/media/accesskeys.php" accesskey="0">Description of Access Keys</a>
  </span>
  <div style="height: 8px"></div>

  <!-- Page footer: maintainers and trademark notice. -->
  <div id="footer">
    <div id="footer_left">
      Maintained by <a href="mailto:groot@kde.org">Adriaan de Groot</a> and <a href="mailto:winter@kde.org">Allen Winter</a>.
      <br/>
      KDE and K Desktop Environment are trademarks of <a href="http://www.kde.org/areas/kde-ev/" title="Homepage of the KDE non-profit Organization">KDE e.V.</a> |
      <a href="http://www.kde.org/contact/impressum.php">Legal</a>
    </div>
    <div id="footer_right"><img src="/media/images/footer_right.png" style="margin: 0px" alt="" /></div>
  </div>
  <!-- WARNING: DO NOT SEND MAIL TO THE FOLLOWING EMAIL ADDRESS! YOU WILL BE BLOCKED INSTANTLY AND PERMANENTLY! <a href="mailto:aaaatrap-425acc3b5374943f@kde.org">Block me</a> WARNING END -->
</body>
</html>