From 8627ab71322b7c6cff9b55fb2f6eb3b558772891 Mon Sep 17 00:00:00 2001
From: Mart Lubbers
Date: Fri, 9 Jan 2015 19:16:08 +0100
Subject: [PATCH] update

---
 thesis2/1.introduction.tex |  7 +++
 thesis2/3.methods.tex      | 39 ----------------
 thesis2/4.discussion.tex   | 95 +++++++++++++++++++++++++++++---------
 thesis2/5.appendices.tex   | 11 ++++-
 thesis2/Makefile           |  2 +-
 thesis2/notes.txt          |  3 ++
 6 files changed, 93 insertions(+), 64 deletions(-)
 create mode 100644 thesis2/notes.txt

diff --git a/thesis2/1.introduction.tex b/thesis2/1.introduction.tex
index 8b02c08..a61eb8f 100644
--- a/thesis2/1.introduction.tex
+++ b/thesis2/1.introduction.tex
@@ -256,6 +256,13 @@
 defined in a human readable text file. In practice it means that one person,
 not by definition a programmer, can be instructed to change the structure and
 this can also greatly reduce programmer intervention time.
+The actual problem statement then becomes:
+\begin{center}
+  \textit{Is it possible to shorten the feedback loop for repairing and adding
+  crawlers by making a system that can create, add and maintain crawlers for
+  RSS feeds?}
+\end{center}
+
 \section{RSS/Atom}
 RSS/Atom feeds, from now on called RSS feeds, are publishing feeds in the XML
 format\cite{Xml} that are used to publish events. Every event or entry consists
diff --git a/thesis2/3.methods.tex b/thesis2/3.methods.tex
index 812cff0..b02e707 100644
--- a/thesis2/3.methods.tex
+++ b/thesis2/3.methods.tex
@@ -165,42 +165,3 @@ the DAWG is still optimal.
   n -> q0[label="SG0"];
 }
 \end{figure}
-
-%\subsection{Process}
-%Proposal was written
-%
-%
-%First html/mail/fax/rss, worst case rss
-%
-%
-%After some research and determining the scope of the project we decided only to
-%do RSS, this because RSS tends to force structure in the data because RSS feeds
-%are often generated by the website and thus reliable and consistent. We found a
-%couple of good RSS feeds.
-%
-%
-%At first the general framework was designed and implemented, no method yet.
-%
-%
-%Started with method for recognizing separators.
-%
-%
-%Found research paper about algorithm that can create directed acyclic graphs
-%from string, although it was designed to compress word lists it can be
-%(mis)used to extract information.
-%
-%
-%Implementation of DAG algorithm found and tied to the program.
-%
-%
-%Command line program ready. Conversation with both supervisors, gui had to be
-%made.
-%
-%Step by step gui created. Web interface as a control center for the crawlers.
-%
-%
-%Gui optimized.
-%
-%
-%Concluded that the program doesn't reach wide audience due to lack of well
-%structured rss feeds.
diff --git a/thesis2/4.discussion.tex b/thesis2/4.discussion.tex
index 43a5122..9fb2fcb 100644
--- a/thesis2/4.discussion.tex
+++ b/thesis2/4.discussion.tex
@@ -1,29 +1,78 @@
 \section{Conclusion}
+\begin{center}
+  \textit{Is it possible to shorten the feedback loop for repairing and adding
+  crawlers by making a system that can create, add and maintain crawlers for
+  RSS feeds?}
+\end{center}
+
+The short answer to the problem statement posed in the introduction is yes.
+We can shorten the feedback loop for repairing and adding crawlers with such
+a system. The system we have built has been tested and provides the means for
+a user with no particular programming skills to generate crawlers, and thus
+the number of interventions for which a programmer is needed is greatly
+reduced. Although we have solved the problem we stated, the results are not
+purely positive: for a problem to be solved, the problem must be present.
 
 Although the research question is answered the underlying goal of the project
-is not achieved. The application is a intuitive system that allows users to
-manage RSS crawlers. With the application it is easy to generate, change, test
-and remove crawlers. However while trying real world data we stumbled upon a
-problem. Lack of RSS feeds and misuse of RSS feeds.
+is not achieved completely. The application is an intuitive system that
+allows users to manage RSS crawlers, and by doing that it does shorten the
+feedback loop, but only within the specific domain of RSS feeds. In the
+testing phase on real-world data we stumbled upon a problem: lack of RSS
+feeds and misuse of RSS feeds lead to a domain that is significantly smaller
+than first theorized.
+
+Lack of RSS feeds is a problem because a lot of entertainment venues have no
+RSS feed available to the public. They either use different techniques or do
+not publish a feed at all. This shrinks the domain considerably. Take pop
+music venues as an example: in a certain province of the Netherlands we can
+find about $25$ venues that have a website, and only $3$ of those have an RSS
+feed. Extrapolating this information, combined with information from other
+regions, we can speculate that less than $10\%$ of the venues use RSS feeds.
+
+The second problem is misuse of RSS feeds. RSS feeds are, due to their
+limited set of possible fields, very structured. We found that a lot of
+venues using an RSS feed are not content with these limitations and try to
+bypass them by misusing the protocol. A common misuse is to use the
+publication date as the date of the actual event. When loading such an RSS
+feed into a general RSS feed reader the outcome is very strange, because a
+lot of events will have a publication date in the future, which messes up
+the order of publication. The misplacement of key information leads to a
+lack of that information in the expected fields and thereby lowers the
+overall extraction performance.
+The second most common misuse is to use HTML formatted text in the text
+fields. The algorithm is designed to detect and extract information via
+patterns in plain text, and its performance on HTML is very bad compared to
+plain text. A text field containing HTML is almost useless to gather
+information from. Via a small study of available RSS feeds we found that
+about $50\%$ of the feeds misuse the protocol in such a way that extraction
+of data is almost impossible. This reduces the domain of good RSS feeds to
+less than $5\%$ of the venues.
 
-\section{Discussion}
+\section{Discussion \& Future Research}
 \label{sec:discuss}
 
+% low level stuff
+The application we created does not apply any techniques to the isolated
+chunks; it is built only to extract, not to process, the labeled chunks of
+text. When such processing is combined with the extraction and information
+is added to the data, at least two things improve. A higher level of
+performance can be reached, because semantic knowledge serves as an extra
+constraint while matching the data. Quicker error detection in the crawlers
+is also possible: a match that is correct at a higher level can still
+contain wrong information at the lower chunk level, and applying matching
+techniques to the chunks afterwards can generate feedback that is also
+useful for the top level of data extraction.
+
-\begin{itemize}
-    \item No low level stuff, future research
-    \item RSS not that great of a source,
-    \item Expand technique to HTML, reuse interface, defining patterns
-    The interface for managing the crawlers works very intuitive and therefore
-    this system could be extended with a dedicated HTML crawler generation
-    module. The current method for extracting the information is not very
-    suitable for HTML but due to the modularity of the program a module can be
-    easily implemented to incorporate another technique in the application.
-    \item \textbf{Combine RSS and HTML}\\
-    A solution for bridging the gap between HTML and RSS could be a software
-    solution that can convert HTML to RSS feeds that can be fed to the existing
-    application. When HTML sites are of a certain structure, namely that with
-    news articles created by a CMS, they can be converted to RSS by flattening
-    out the structure and create the specified fields of information of RSS
-    entries. In this way the current application can be used to also process
-    possibly complicated HTML sources.
-\end{itemize}
+% combine RSS HTML
+Another improvement could be to combine the strengths of HTML and RSS. Some
+specifically structured HTML sources could be converted to an RSS feed and
+still be processed by the application. In this way, with an extra
+intermediate step, the extraction techniques can still be used. The HTML
+sources most likely have to be machine-generated, because a very consistent
+structure in the data is required; websites with such structure are usually
+generated by a CMS. This would enlarge the domain of the application
+significantly, since many websites use a CMS to publish their data. When
+conversion from HTML to RSS is not possible, but a technique exists that can
+extract patterns in a way similar to this application, it is also possible
+to embed that technique in the current application. Due to the modularity of
+the application, extending it in this way is easy.
diff --git a/thesis2/5.appendices.tex b/thesis2/5.appendices.tex
index dca472e..952575e 100644
--- a/thesis2/5.appendices.tex
+++ b/thesis2/5.appendices.tex
@@ -3,7 +3,16 @@
 \label{pseudodawg}
 \caption{Graph minimization algorithm}
 \begin{minted}[mathescape=true,linenos=true]{text}
-    register:=$\emptyset$
+register:=$\emptyset$
+while there is another word
+    word = next word
+    commonprefix = CommonPrefix(word)
+    laststate = $\delta^*(q_0,$ commonprefix$)$
+    currentsuffix = word[length(commonprefix)+1 ... length(word)]
+    if HasChildren(laststate)
+        ReplaceOrRegister(laststate)
+    AddSuffix(laststate, currentsuffix)
+ReplaceOrRegister($q_0$)
 \end{minted}
 \end{listing}
 
diff --git a/thesis2/Makefile b/thesis2/Makefile
index 9b75466..29934ec 100644
--- a/thesis2/Makefile
+++ b/thesis2/Makefile
@@ -1,5 +1,5 @@
 SHELL:=/bin/bash
-VERSION:=0.5
+VERSION:=0.6
 
 all: thesis
 
diff --git a/thesis2/notes.txt b/thesis2/notes.txt
new file mode 100644
index 0000000..4db7eb6
--- /dev/null
+++ b/thesis2/notes.txt
@@ -0,0 +1,3 @@
+http://www.devorstin.nl/lib/services/rss/?type=agenda
+http://www.vriendenvandebakkerij.nl/feed/
+http://www.paradiso.nl/
-- 
2.20.1
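
Note on the HTML-to-RSS conversion proposed in the discussion section: the
flattening step could look roughly like the sketch below. This is an
illustration only, not part of the thesis code; the BeautifulSoup dependency
and the CSS selectors (div.agenda-item, h2, p.date) are assumptions that
would have to be configured per site.

    # Sketch: flatten a CMS-generated HTML agenda into a minimal RSS 2.0
    # document so that the existing RSS crawlers can process it.
    import xml.etree.ElementTree as ET
    from bs4 import BeautifulSoup

    def html_to_rss(html, channel_title):
        soup = BeautifulSoup(html, "html.parser")
        rss = ET.Element("rss", version="2.0")
        channel = ET.SubElement(rss, "channel")
        ET.SubElement(channel, "title").text = channel_title
        # One RSS item per agenda entry; the selectors are hypothetical
        # and would differ per CMS.
        for event in soup.select("div.agenda-item"):
            item = ET.SubElement(channel, "item")
            title = event.select_one("h2")
            date = event.select_one("p.date")
            if title is not None:
                ET.SubElement(item, "title").text = title.get_text(strip=True)
            if date is not None:
                # Keep the event date in the description rather than pubDate,
                # to avoid the pubDate misuse described in the conclusion.
                ET.SubElement(item, "description").text = date.get_text(strip=True)
        return ET.tostring(rss, encoding="unicode")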