update stuff

author Mart Lubbers <mart@martlubbers.net>

Fri, 28 Nov 2014 11:50:28 +0000 (12:50 +0100)

committer Mart Lubbers <mart@martlubbers.net>

Fri, 28 Nov 2014 11:50:28 +0000 (12:50 +0100)
author Mart Lubbers <mart@martlubbers.net>
Fri, 28 Nov 2014 11:50:28 +0000 (12:50 +0100)
committer Mart Lubbers <mart@martlubbers.net>
Fri, 28 Nov 2014 11:50:28 +0000 (12:50 +0100)
diff --git a/program/everything/xml/scheme.xsd b/program/everything/xml/scheme.xsd

index 56834a0..9fec273 100644 (file)
--- a/program/everything/xml/scheme.xsd
+++ b/program/everything/xml/scheme.xsd
@@ -1,11 +1,14 @@
  <?xml version="1.0" encoding="UTF-8"?>
  <?xml-stylesheet type="text/xsl" href="xs3p.xsl"?>
-<xs:schema attributeFormDefault="unqualified" elementFormDefault="qualified" xmlns:xs="http://www.w3.org/2001/XMLSchema">
+<xs:schema attributeFormDefault="unqualified" elementFormDefault="qualified"
+       xmlns:xs="http://www.w3.org/2001/XMLSchema">
  <!--This is the main element, required. It contains crawler and/data entries-->
    <xs:element name="crawler_output">
      <xs:complexType>
        <xs:sequence>
-<!--Crawler entries contain the information of the crawler, there can be multiple-->
+<!--
+Crawler entries contain the information of the crawler, there can be multiple
+-->
          <xs:element name="crawler" maxOccurs="unbounded" minOccurs="0">
            <xs:complexType>
              <xs:simpleContent>
@@ -21,7 +24,10 @@
              </xs:simpleContent>
            </xs:complexType>
          </xs:element>
-<!--Data entries contain the information of a single crawled entry, there can be multiple-->
+<!--
+Data entries contain the information of a single crawled entry, there can be
+multiple
+-->
          <xs:element name="data" maxOccurs="unbounded" minOccurs="0">
            <xs:complexType>
              <xs:sequence>
@@ -37,13 +43,17 @@
                      <xs:element type="xs:string" name="full_title"/>
                      <xs:element type="xs:string" name="full_summary"/>
  <!--These fields contain some other information from the rss-->
-                    <xs:element type="xs:anyURI" name="link" maxOccurs="unbounded" minOccurs="0"/>
-                    <xs:element type="xs:string" name="pub_date" maxOccurs="unbounded" minOccurs="0"/>
+                                                                               <xs:element type="xs:anyURI" name="link"
+                                                                                       maxOccurs="unbounded" minOccurs="0"/>
+                                                                               <xs:element type="xs:string" name="pub_date"
+                                                                                       maxOccurs="unbounded" minOccurs="0"/>
  <!--Extracted URIs is a list of urls, this can be empty-->
-                    <xs:element name="extracted_uris" maxOccurs="unbounded" minOccurs="0">
+                                                                               <xs:element name="extracted_uris" maxOccurs="unbounded"
+                                                                                       minOccurs="0">
                        <xs:complexType>
                          <xs:sequence>
-                          <xs:element type="xs:anyURI" name="url" maxOccurs="unbounded" minOccurs="0"/>
+                                                                                                       <xs:element type="xs:anyURI" name="url"
+                                                                                                               maxOccurs="unbounded" minOccurs="0"/>
                          </xs:sequence>
                        </xs:complexType>
                      </xs:element>
diff --git a/thesis2/1.introduction.tex b/thesis2/1.introduction.tex

index e07464d..a055d17 100644 (file)
--- a/thesis2/1.introduction.tex
+++ b/thesis2/1.introduction.tex
@@ -159,17 +159,17 @@ different crawlers then goes to the \textit{Temporum}.
  \paragraph{Temporum}
  The \textit{Temporum} is a big bin that contains raw data extracted from
  different sources and has to be post processed to be suitable enough for the
-actual database. This post processing encompasses several possible tasks.
+actual database. This processing encompasses several possible tasks.
  
  The first task is to check the validity of the entry. This is a very shallow
  test to check if the crawler is not malfunctioning and there is no nonsense in
-the data.
-
-The second step is matching the entry to several objects. For example the entry
-has to be matched to a certain venue when its source is a ticket vendor who
-sells tickets for multiple venues. Another example is that the event is a pop
-concert and is part of a big tour. Many of these tasks are done alone by or
-with aid of a computer program. Almost no data is going straight through the
+the data. Most of the data is not directly checked for validity; the data is
+skimmed for strange things but not every data point is checked. The second step
+is matching the entry to several objects. For example the entry has to be
+matched to a certain venue when its source is a ticket vendor who sells tickets
+for multiple venues. Another example is that the event is a pop concert and is
+part of a big tour. Many of these tasks are done alone by or with aid of a
+computer program. Almost no data is going straight through the
  \textit{Temporum} and this property makes the \textit{Temporum} a safety net
  for all the incoming data.
  
@@ -192,11 +192,11 @@ expensive and has a long feedback loop. When a source changes it is first
  preprocessed in the old way, send to the \textit{Temporum} and checked by a
  human and matched. The human then notices the error in the data and contacts
  the programmer. The programmer then has to reprogram the specific crawler to
-the new structure. This feedback loop, shown in Figure~\ref{fig:1.2.1} can take
-days and can be the reason for gaps or faulty information in the database. 
+the new structure. This feedback loop, shown in Figure~\ref{feedbackloop}, can
+take days and can be the reason for gaps or faulty information in the database. 
  \begin{figure}[H]
         \caption{Feedback loop for malfunctioning crawlers}
-       \label{fig:1.1.2}
+       \label{feedbackloop}
         \centering
         \scalebox{0.8}{
                 \digraph[]{graph112}{
@@ -222,31 +222,40 @@ days and can be the reason for gaps or faulty information in the database.
  The goal of the project is to relieve the programmer of repairing crawlers all
  the time and make the task of adapting, editing and removing crawlers doable
  for someone without programming experience. In practice this means in
-Figure~\ref{fig:1.1.2} removing the dotted arrows by dashed arrow.
+Figure~\ref{feedbackloop} replacing the dotted arrows by the dashed arrow.
  
  For this project an application has been developed that can provide an
-interface to a crawler system that is able to crawl
-RSS\footnote{\url{http://rssboard.org/rss-specification}} and
-Atom\footnote{\url{http://tools.ietf.org/html/rfc5023}} publishing feeds.
-The interface also provides the user with point and click interfaces to create,
-modify, test and remove crawlers. The Hyperleap back end can, via this
-interface, generate XML feeds that contain the crawled data. For editing the
-structure and contents of the program a programmer is in theory also not
-necessary because all the things someone wants to change are located in a
-single file that is human readable. In practice it means that one person, not
-by definition a programmer, can be instructed to change the structure and this
-can also greatly reduce programmer intervention time. 
+interface to a crawler system that is able to crawl RSS\cite{Rss} and
+Atom\cite{Atom} publishing feeds.  The interface also provides the user with
+point and click interfaces to create, modify, test and remove crawlers. The
+Hyperleap back end can, via this interface, generate XML feeds that contain the
+crawled data. For editing the structure and contents of the program a
+programmer is in theory also not necessary because all the things someone wants
+to change are located in a single file that is human readable. In practice it
+means that one person, not by definition a programmer, can be instructed to
+change the structure and this can also greatly reduce programmer intervention
+time. 
  
  \section{RSS/Atom}
-RSS/Atom feeds, from now on called RSS feeds, are publishing
-feeds in the XML format\footnote{\url{http://www.w3.org/TR/REC-xml/}} that are
-used to publish events. Every event or entry consists of several standardized
-fields. The main data fields are the \textit{title} and the
-\textit{description} field. In these fields the raw data is stored that
+RSS/Atom feeds, from now on called RSS feeds, are publishing feeds in the XML
+format\cite{Xml} that are used to publish events. Every event or entry consists
+of several standardized fields. The main data fields are the \textit{title} and
+the \textit{description} field. In these fields the raw data is stored that
  describes the event. Further there are several auxiliary fields that store the
  link to the full article, store the publishing data, store some media in the
  form of an image or video URL and store a \textit{Globally Unique
-Identifier}(GUID).
+Identifier}(GUID)\footnote{A GUID is a unique identifier that in most cases is
+the permalink of the article. A permalink is a link that will point to the
+article}. An example of an RSS feed can be found in Listing~\ref{examplerss};
+this listing shows a partly truncated RSS feed of a well-known venue in the
+Netherlands.
+
+\begin{listing}
+       \caption{An example of a partly truncated RSS feed of a well-known Dutch
+       venue}
+       \label{examplerss}
+       \xmlcode{exrss.xml}
+\end{listing}
  
  The RSS feeds are mostly used by news sites to publish their articles, the feed
  then only contains the headlines and if the user that is reading the feed is
@@ -266,29 +275,28 @@ website background be able to post news items and event information.
  
  \section{Why RSS?}
  \label{sec:whyrss}
-Information from venues comes in various different format with for each format
-several positive and negative points. For this project we chose to focus on
-RSS feeds. RSS feeds are, as explained earlier, very structured and consistent
-in their structure. The structure basically only changes if the CMS changes or
-upgrades. Because of this patterns don't have to be retrained a lot.
-
-In comparison to websites RSS feeds don't have a structural dimension in
-the data, all the information in an entry is basically two fields of plain
-text. Therefore an entirely new strategy has to be applied to train the
-patterns.
-
-\section{Scientific relevance and similar research}
-Currently the techniques for conversion from non structured data to structured
-data are static and mainly only usable by computer science experts. There is a
-great need of data mining in non structured data because the data within
-companies and on the internet is piling up and are usually left to catch dust.
-
-The project is a followup project of the past project done by Roelofs et
-al.\cite{Roelofs2008} and \cite{Roelofs2009}. Roelofs et al. described
-techniques on recognizing patterns in website data and published about an
-adapted levenstein distance algorithm to recognize data in isolated text. These
-techniques of recognizing data in text can still be used to interpret the
-isolated extracted parts from this project.
+The first proposal described formats like HTML, fax/email, RSS, XML and some
+more. Because of the scope of the project and the time planned for it we had to
+remove some of the input formats because they all require different techniques.
+For example when the input source is in HTML format, most probably a website,
+then a great deal of the information extraction can be automated using the
+structural information which is characteristic for HTML. For fax/email however
+there is almost no structural information and most of the automation techniques
+require natural language processing. We chose RSS feeds because RSS feeds lack
+inherent structural information but are still very structured. This structure
+is because, as said above, the RSS feeds are generated and therefore almost
+always look the same. Also, in RSS feeds most venues use particular structural
+identifiers that are characters. They separate fields with vertical bars,
+commas, whitespace and more non text characters. These field separators and
+keywords can be hard for a computer to detect but people are very good at
+detecting these. With one look they can identify the characters and keywords
+and build a pattern in their head.
+Another reason we chose RSS is their temporal consistency, RSS feeds are almost
+always generated and because of that the structure of the entries is very
+unlikely to change. Basically the RSS feeds only change structure when the CMS
+that generates it changes the generation algorithm. This property is useful
+because the crawlers then do not have to be retrained very often. Because of
+the lack of inherent structure entirely new strategies had to be applied.
  
  \section{Directed Acyclic Graphs}
  \paragraph{Normal graphs}
@@ -296,12 +304,12 @@ A graph($G$) is a mathematical structure to describe relations between nodes
  with edges. A standard graph is defined as the ordered pair: $G=(V,E)$.  In
  this ordered pair $V$ is the set of nodes and $E$ is set of undirected edges
  where every undirected edge is a tuple of two nodes.
-Figure~\ref{fig:graphexample} is specified as: 
+Figure~\ref{graphexample} is specified as: 
         $$G=(\{n1, n2, n3\}, \{(n1, n2), (n2, n3), (n2, n3)\})$$
  
  \begin{figure}[H]
         \caption{Example Graph}
-       \label{fig:graphexample}
+       \label{graphexample}
         \centering
         \digraph[]{graphexample}{
                 rankdir=LR
@@ -318,13 +326,13 @@ still the edges but the inherent difference is that the edges are ordered
  tuples in stead of not ordered. Adding this property gives the edges a
  direction. Every edge has a specific start and end and are therefore called
  directed edges. A directed graph would look the same as the graph in
-Figure~\ref{fig:graphexample} but then the normal edges would be replaced by
+Figure~\ref{graphexample} but then the normal edges would be replaced by
  directional arrows that specifically go from one node to the other.
  
  \paragraph{Directed acyclic graphs}
  Directed Acyclic Graphs(DAGs) are a special kind of directed graphs. DAGs
  are also defined as $G=(V,E)$ but with a restriction on $E$. Namely that cycles
-are not allowed. Figure~\ref{fig:dagexample} shows two graphs. The left graph
+are not allowed. Figure~\ref{dagexample} shows two graphs. The left graph
  contains a cycle and the right graph does not. Only the right graph is a valid
  DAG. A cycle is defined by a sequence of edges where nodes are visited more
  then once. Adding the property of non-cyclicity to graphs lowers the
@@ -333,7 +341,7 @@ graph to $\mathcal{O}(L)$ where $L$ is the length of the sequence.
  
  \begin{figure}[H]
         \caption{Example DAG}
-       \label{fig:dagexample}
+       \label{dagexample}
         \centering
         \digraph[]{dagexample}{
                 rankdir=LR
@@ -348,14 +356,24 @@ graph to $\mathcal{O}(L)$ where $L$ is the length of the sequence.
  
  \paragraph{Directed Acyclic Word Graphs}
  The type of graph used in the project is a special kind of DAG called
-Directed Acyclic Word Graphs(DAWGs). A DAWG can be defined by the ordered pair
-$G=(V,E)$ and is the same as a directed graph except for the edges. An edge in
-a DAWG is instead of a tuple a triple and consist of a starting point, an end
-point and a label. Because of the property of labeled edges data can be stored
+Directed Acyclic Word Graphs(DAWGs). A DAWG can be defined by the ordered
+triple $G=(V,E,F)$.
+$V$ is the same as in directed graphs. $E$ is also the same besides the fact
+that the ordered pair of nodes that describes the edge is now a triple
+consisting of a start node, an end node and a label. In a standard DAWG the
+label is always one character. $F$ describes the set of final nodes, final
+nodes are nodes that can be the end of a sequence even if another arrow leads
+out. In the example graph in Figure~\ref{exampledawg} the final nodes are
+visualized with a double circle as node shape. In this example it is purely
+cosmetic because $n6$ is a final node anyways because there are no arrows
+leading out. But for example in $G=(\{n1, n2, n3\}, \{(n1, n2, a), (n2, n3,
+b)\}, \{n2, n3\})$ there is a distinct use for the final node marking. Graph $G$
+accepts the words \textit{a,ab} and to simplify the graph node $n2$ and $n3$
+are final. Because of the property of labeled edges data can be stored
  in a DAWG. When traversing a DAWG and saving all the edgelabels one can
  construct words. Because of properties of graphs a DAWG can store big sets of
  words in a small amount of storage because it can re-use edges to specify
-transitions. For example the graph in Figure~\ref{fig:2.1.1} can describe a
+transitions. For example the graph in Figure~\ref{exampledawg} can describe a
  dictionary that holds the words: \textit{abd, bad, bae}. Testing if a word is
  present in the DAWG is, just as for a DAG, falls in the computational
  complexity class of $\mathcal{O}(L)$ meaning that it grows linearly with the
@@ -363,18 +381,18 @@ length of the word.
  
  \begin{figure}[H]
         \caption{Example DAWG}
-       \label{fig:2.1.1}
+       \label{exampledawg}
         \centering
         \digraph[]{graph21}{
                 rankdir=LR;
-               1,2,3,4,5 [shape="circle"];
-               6 [shape="doublecircle"];
-               1 -> 2 [label="a"];
-               2 -> 3 [label="b"];
-               3 -> 6 [label="d"];
-               1 -> 4 [label="b"];
-               4 -> 5 [label="a"];
-               5 -> 6 [label="d"];
-               5 -> 6 [label="e"];
+               n1,n2,n3,n4,n5 [shape="circle"];
+               n6 [shape="doublecircle"];
+               n1 -> n2 [label="a"];
+               n2 -> n3 [label="b"];
+               n3 -> n6 [label="d"];
+               n1 -> n4 [label="b"];
+               n4 -> n5 [label="a"];
+               n5 -> n6 [label="d"];
+               n5 -> n6 [label="e"];
         }
  \end{figure}
diff --git a/thesis2/2.requirementsanddesign.tex b/thesis2/2.requirementsanddesign.tex

index db8c35a..bb2aa1f 100644 (file)
--- a/thesis2/2.requirementsanddesign.tex
+++ b/thesis2/2.requirementsanddesign.tex
@@ -110,18 +110,18 @@ specific.
  \subsection{Backend}
  \subsubsection{Program description}
  The backend consists of a main module and a set of libraries all written in
-\textit{Python}\footnote{\url{https://www.python.org/}}. The main module can,
+\textit{Python}\cite{Python}. The main module can,
  and is, be embedded in an apache
  webserver\footnote{\url{https://httpd.apache.org/}} via the
-\textit{mod\_python} apache module\footnote{\url{http://modpython.org/}}.  The
-module \textit{mod\_python} allows the webserver to execute \textit{Python}
-code in the webserver. We chose Python because of the rich set of standard
-libraries and solid cross platform capabilities. We chose Python 2 because it
-is still the default Python version on all major operating systems and stays
-supported until at least the year 2020 meaning that the program can function
-safe at least 5 full years. The application consists of a main Python module
-that is embedded in the webserver. Finally there are some libraries and there
-is a standalone program that does the periodic crawling.
+\textit{mod\_python} apache module\cite{Modpython}. The module
+\textit{mod\_python} allows the webserver to execute Python code in
+the webserver. We chose Python because of the rich set of standard libraries
+and solid cross platform capabilities. We chose Python 2 because it is still
+the default Python version on all major operating systems and stays supported
+until at least the year 2020 meaning that the program can function safely for
+at least 5 full years. The application consists of a main Python module that is
+embedded in the webserver. Finally there are some libraries and there is a
+standalone program that does the periodic crawling.
  
  \subsubsection{Main module}
  The main module is the program that deals with the requests, controls the
@@ -150,5 +150,14 @@ The crawler saves all the data in a database. The database is a simple
  dictionary where all the entries are hashed so that the crawler knows which
  ones are already present in the database and which ones are new so that it
  does not have to process all the old entries when they appear in the feed. The
-crawler also has a function to export the database to XML format. The XML
-format is specified in an XSD file for minimal ambiguity. 
+RSS's GUID could also have been used but since it is an optional value in the
+feed not every feed uses the GUID and therefore it is not reliable to use it.
+The crawler also has a function to export the database to XML format. The XML
+format is specified in an XSD\cite{Xsd} file for minimal ambiguity. 
+
+An XSD file is a file that precisely describes the fields the XML file uses.
+As almost every programming language contains an XML library most of the
+languages also contain an XSD library that allows the XML library to parse files
+according to the scheme and in that way the programmer knows exactly what to
+expect as XML fields. The XSD file for this program can be found in the
+appendices in Listing~\ref{scheme.xsd}.
diff --git a/thesis2/6.appendices.tex b/thesis2/6.appendices.tex

index ee8c8b9..20fc864 100644 (file)
--- a/thesis2/6.appendices.tex
+++ b/thesis2/6.appendices.tex
@@ -1,2 +1,6 @@
-       \section{Algorithm}
-       \section{Progress}
+\section{Algorithm}
+\section{Schemes}
+\subsection{scheme.xsd}
+\label{scheme.xsd}
+\xmlcode{scheme1.xsd}
+\xmlcode[firstnumber=51]{scheme2.xsd}
diff --git a/thesis2/Makefile b/thesis2/Makefile

index d68058a..9877cdb 100644 (file)
--- a/thesis2/Makefile
+++ b/thesis2/Makefile
@@ -4,13 +4,15 @@ VERSION:=0.3
  all: thesis
  
  thesis:
-       latex thesis.tex > log.txt
+       head -n 50 scheme.xsd > scheme1.xsd
+       tail -n +51 scheme.xsd > scheme2.xsd  # resume at line 51: no overlap with head, matches firstnumber=51
+       latex -shell-escape thesis.tex > log.txt
         bibtex thesis.aux >> log.txt
         latex -shell-escape thesis.tex >> log.txt
-       latex thesis.tex >> log.txt
+       latex -shell-escape thesis.tex >> log.txt
         dvipdfm thesis.dvi >> log.txt 2>&1
         mv -v {thesis,mart_thesis_$(VERSION)}.pdf
  
  clean:
-       rm -vf *.{aux,bbl,blg,dvi,log,out,pdf,toc,dot,ps} log.txt
+       rm -vf *.{aux,bbl,blg,dvi,log,out,pdf,toc,dot,ps} log.txt scheme[12].xsd
  
diff --git a/thesis2/exrss.xml b/thesis2/exrss.xml

new file mode 100644 (file)

index 0000000..3e57bc9
--- /dev/null
+++ b/thesis2/exrss.xml
@@ -0,0 +1,24 @@
+<?xml version="1.0" ?>
+<rss version="2.0">
+       <channel>
+               <title>Nieuw in de voorverkoop Paradiso</title>
+               <link>http://www.paradiso.nl/web/show/id=178182</link>
+               <description></description>
+                       <item>
+                               <title>donderdag 8 januari 2015 22:00 - Tee Pee Records Night - live:
+       Harsh Toke, Comet Control</title>
+                               <link>http://www.paradiso.nl/web/Agenda-Item/Tee-Pee-Records-Night-
+       live-Harsh-Toke-Comet-Control.htm</link>
+                               <description></description>
+                               <pubDate>do, 27 nov 2014 11:34:00 GMT</pubDate>
+                       </item>
+                       <item>
+                               <title>vrijdag 20 maart 2015 22:00 - Atanga Boom - cd release</title>
+                               <link>
+       http://www.paradiso.nl/web/Agenda-Item/Atanga-Boom-cd-release.htm
+                               </link>
+                               <description></description>
+                               <pubDate>do, 27 nov 2014 10:34:00 GMT</pubDate>
+                       </item>
+                       <item>
+                               <title>zaterdag 21 maart 2015 20:00 - ...
diff --git a/thesis2/scheme.xsd b/thesis2/scheme.xsd

new file mode 120000 (symlink)

index 0000000..4ee7d39
--- /dev/null
+++ b/thesis2/scheme.xsd
@@ -0,0 +1 @@
+../program/everything/xml/scheme.xsd
+\ No newline at end of file
diff --git a/thesis2/scheme1.xsd b/thesis2/scheme1.xsd

new file mode 100644 (file)

index 0000000..55be89e
--- /dev/null
+++ b/thesis2/scheme1.xsd
@@ -0,0 +1,50 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-stylesheet type="text/xsl" href="xs3p.xsl"?>
+<xs:schema attributeFormDefault="unqualified" elementFormDefault="qualified"
+       xmlns:xs="http://www.w3.org/2001/XMLSchema">
+<!--This is the main element, required. It contains crawler and/data entries-->
+  <xs:element name="crawler_output">
+    <xs:complexType>
+      <xs:sequence>
+<!--
+Crawler entries contain the information of the crawler, there can be multiple
+-->
+        <xs:element name="crawler" maxOccurs="unbounded" minOccurs="0">
+          <xs:complexType>
+            <xs:simpleContent>
+              <xs:extension base="xs:string">
+                <xs:attribute type="xs:string" name="name" use="optional"/>
+                <xs:attribute type="xs:string" name="venue" use="optional"/>
+                <xs:attribute type="xs:string" name="freq" use="optional"/>
+                <xs:attribute type="xs:string" name="def_loc" use="optional"/>
+                <xs:attribute type="xs:string" name="adress" use="optional"/>
+                <xs:attribute type="xs:anyURI" name="website" use="optional"/>
+                <xs:attribute type="xs:anyURI" name="url" use="optional"/>
+              </xs:extension>
+            </xs:simpleContent>
+          </xs:complexType>
+        </xs:element>
+<!--
+Data entries contain the information of a single crawled entry, there can be
+multiple
+-->
+        <xs:element name="data" maxOccurs="unbounded" minOccurs="0">
+          <xs:complexType>
+            <xs:sequence>
+              <xs:element name="entry">
+                <xs:complexType>
+                  <xs:sequence>
+<!--These four fields contain the user data-->
+                    <xs:element type="xs:string" name="where"/>
+                    <xs:element type="xs:string" name="what"/>
+                    <xs:element type="xs:string" name="date"/>
+                    <xs:element type="xs:string" name="time"/>
+<!--These fields contain the raw original title and summary-->
+                    <xs:element type="xs:string" name="full_title"/>
+                    <xs:element type="xs:string" name="full_summary"/>
+<!--These fields contain some other information from the rss-->
+                                                                               <xs:element type="xs:anyURI" name="link"
+                                                                                       maxOccurs="unbounded" minOccurs="0"/>
+                                                                               <xs:element type="xs:string" name="pub_date"
+                                                                                       maxOccurs="unbounded" minOccurs="0"/>
+<!--Extracted URIs is a list of urls, this can be empty-->
diff --git a/thesis2/scheme2.xsd b/thesis2/scheme2.xsd

new file mode 100644 (file)

index 0000000..abc2d3f
--- /dev/null
+++ b/thesis2/scheme2.xsd
@@ -0,0 +1,24 @@
+                                                                                       maxOccurs="unbounded" minOccurs="0"/>
+<!--Extracted URIs is a list of urls, this can be empty-->
+                                                                               <xs:element name="extracted_uris" maxOccurs="unbounded"
+                                                                                       minOccurs="0">
+                      <xs:complexType>
+                        <xs:sequence>
+                                                                                                       <xs:element type="xs:anyURI" name="url"
+                                                                                                               maxOccurs="unbounded" minOccurs="0"/>
+                        </xs:sequence>
+                      </xs:complexType>
+                    </xs:element>
+                  </xs:sequence>
+                </xs:complexType>
+              </xs:element>
+            </xs:sequence>
+<!--These fields specify the crawler name and the date crawled-->
+            <xs:attribute type="xs:string" name="from"/>
+            <xs:attribute type="xs:dateTime" name="date"/>
+          </xs:complexType>
+        </xs:element>
+      </xs:sequence>
+    </xs:complexType>
+  </xs:element>
+</xs:schema>
diff --git a/thesis2/thesis.bib b/thesis2/thesis.bib

index ef9957f..126f710 100644 (file)
--- a/thesis2/thesis.bib
+++ b/thesis2/thesis.bib
@@ -125,3 +125,40 @@ author = {Watson, Bruce W and Science, Computing and Eindhoven, Technische Unive
  file = {:home/mart/articles/Unknown/Watson, Science, Eindhoven\_Constructing minimal acyclic deterministic finite automata.pdf:pdf},
  title = {{Constructing minimal acyclic deterministic finite automata}}
  }
+@book{Python,
+       author = {van Rossum, G. and Drake, F.L. (eds)},
+       title = {Python Reference Manual},
+       publisher = {PythonLabs, Virginia, USA},
+       year = {2001},
+       howpublished = {Available at http://www.python.org}
+}
+@misc{Modpython,
+       title={mod python: Apache/Python Integration},
+       author={Trubetskoy, G}
+}
+@article{Rss,
+       title={RSS 2.0 Specification},
+       author={Board, RSS Advisory},
+       journal={Web available},
+       year={2007}
+}
+@article{Atom,
+       title={The atom syndication format},
+       author={Nottingham, Mark and Sayre, Robert},
+       year={2005}
+}
+@article{Xml,
+       title={Extensible markup language (XML)},
+       author={Bray, Tim and Paoli, Jean and Sperberg-McQueen, C Michael and
+               Maler, Eve and Yergeau, Fran{\c{c}}ois},
+       journal={World Wide Web Consortium Recommendation REC-xml-19980210.
+               http://www.w3.org/TR/1998/REC-xml-19980210},
+       year={1998}
+}
+@article{adler2001extensible,
+       title={Extensible Stylesheet Language (XSL)-Version 1.0},
+       author={Adler, Sharon and Milowski, Alex and Richman, Jeremy and Zilles,
+               Steve and others},
+       year={2001},
+       publisher={Citeseer}
+}
diff --git a/thesis2/thesis.pyg b/thesis2/thesis.pyg

new file mode 100644 (file)

index 0000000..fc7be3e
--- /dev/null
+++ b/thesis2/thesis.pyg
@@ -0,0 +1,95 @@
+
+\makeatletter
+\def\PY@reset{\let\PY@it=\relax \let\PY@bf=\relax%
+    \let\PY@ul=\relax \let\PY@tc=\relax%
+    \let\PY@bc=\relax \let\PY@ff=\relax}
+\def\PY@tok#1{\csname PY@tok@#1\endcsname}
+\def\PY@toks#1+{\ifx\relax#1\empty\else%
+    \PY@tok{#1}\expandafter\PY@toks\fi}
+\def\PY@do#1{\PY@bc{\PY@tc{\PY@ul{%
+    \PY@it{\PY@bf{\PY@ff{#1}}}}}}}
+\def\PY#1#2{\PY@reset\PY@toks#1+\relax+\PY@do{#2}}
+
+\expandafter\def\csname PY@tok@gd\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.63,0.00,0.00}{##1}}}
+\expandafter\def\csname PY@tok@gu\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.50,0.00,0.50}{##1}}}
+\expandafter\def\csname PY@tok@gt\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.00,0.27,0.87}{##1}}}
+\expandafter\def\csname PY@tok@gs\endcsname{\let\PY@bf=\textbf}
+\expandafter\def\csname PY@tok@gr\endcsname{\def\PY@tc##1{\textcolor[rgb]{1.00,0.00,0.00}{##1}}}
+\expandafter\def\csname PY@tok@cm\endcsname{\let\PY@it=\textit\def\PY@tc##1{\textcolor[rgb]{0.25,0.50,0.50}{##1}}}
+\expandafter\def\csname PY@tok@vg\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.10,0.09,0.49}{##1}}}
+\expandafter\def\csname PY@tok@m\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
+\expandafter\def\csname PY@tok@mh\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
+\expandafter\def\csname PY@tok@go\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.53,0.53,0.53}{##1}}}
+\expandafter\def\csname PY@tok@ge\endcsname{\let\PY@it=\textit}
+\expandafter\def\csname PY@tok@vc\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.10,0.09,0.49}{##1}}}
+\expandafter\def\csname PY@tok@il\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
+\expandafter\def\csname PY@tok@cs\endcsname{\let\PY@it=\textit\def\PY@tc##1{\textcolor[rgb]{0.25,0.50,0.50}{##1}}}
+\expandafter\def\csname PY@tok@cp\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.74,0.48,0.00}{##1}}}
+\expandafter\def\csname PY@tok@gi\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.00,0.63,0.00}{##1}}}
+\expandafter\def\csname PY@tok@gh\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.00,0.50}{##1}}}
+\expandafter\def\csname PY@tok@ni\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.60,0.60,0.60}{##1}}}
+\expandafter\def\csname PY@tok@nl\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.63,0.63,0.00}{##1}}}
+\expandafter\def\csname PY@tok@nn\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.00,1.00}{##1}}}
+\expandafter\def\csname PY@tok@no\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.53,0.00,0.00}{##1}}}
+\expandafter\def\csname PY@tok@na\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.49,0.56,0.16}{##1}}}
+\expandafter\def\csname PY@tok@nb\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}}
+\expandafter\def\csname PY@tok@nc\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.00,1.00}{##1}}}
+\expandafter\def\csname PY@tok@nd\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.67,0.13,1.00}{##1}}}
+\expandafter\def\csname PY@tok@ne\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.82,0.25,0.23}{##1}}}
+\expandafter\def\csname PY@tok@nf\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.00,0.00,1.00}{##1}}}
+\expandafter\def\csname PY@tok@si\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.73,0.40,0.53}{##1}}}
+\expandafter\def\csname PY@tok@s2\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}}
+\expandafter\def\csname PY@tok@vi\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.10,0.09,0.49}{##1}}}
+\expandafter\def\csname PY@tok@nt\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}}
+\expandafter\def\csname PY@tok@nv\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.10,0.09,0.49}{##1}}}
+\expandafter\def\csname PY@tok@s1\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}}
+\expandafter\def\csname PY@tok@kd\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}}
+\expandafter\def\csname PY@tok@sh\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}}
+\expandafter\def\csname PY@tok@sc\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}}
+\expandafter\def\csname PY@tok@sx\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}}
+\expandafter\def\csname PY@tok@bp\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}}
+\expandafter\def\csname PY@tok@c1\endcsname{\let\PY@it=\textit\def\PY@tc##1{\textcolor[rgb]{0.25,0.50,0.50}{##1}}}
+\expandafter\def\csname PY@tok@kc\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}}
+\expandafter\def\csname PY@tok@c\endcsname{\let\PY@it=\textit\def\PY@tc##1{\textcolor[rgb]{0.25,0.50,0.50}{##1}}}
+\expandafter\def\csname PY@tok@mf\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
+\expandafter\def\csname PY@tok@err\endcsname{\def\PY@bc##1{\setlength{\fboxsep}{0pt}\fcolorbox[rgb]{1.00,0.00,0.00}{1,1,1}{\strut ##1}}}
+\expandafter\def\csname PY@tok@mb\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
+\expandafter\def\csname PY@tok@ss\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.10,0.09,0.49}{##1}}}
+\expandafter\def\csname PY@tok@sr\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.73,0.40,0.53}{##1}}}
+\expandafter\def\csname PY@tok@mo\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
+\expandafter\def\csname PY@tok@kn\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}}
+\expandafter\def\csname PY@tok@mi\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
+\expandafter\def\csname PY@tok@gp\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.00,0.50}{##1}}}
+\expandafter\def\csname PY@tok@o\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
+\expandafter\def\csname PY@tok@kr\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}}
+\expandafter\def\csname PY@tok@s\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}}
+\expandafter\def\csname PY@tok@kp\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}}
+\expandafter\def\csname PY@tok@w\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.73,0.73,0.73}{##1}}}
+\expandafter\def\csname PY@tok@kt\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.69,0.00,0.25}{##1}}}
+\expandafter\def\csname PY@tok@ow\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.67,0.13,1.00}{##1}}}
+\expandafter\def\csname PY@tok@sb\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}}
+\expandafter\def\csname PY@tok@k\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}}
+\expandafter\def\csname PY@tok@se\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.73,0.40,0.13}{##1}}}
+\expandafter\def\csname PY@tok@sd\endcsname{\let\PY@it=\textit\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}}
+
+\def\PYZbs{\char`\\}
+\def\PYZus{\char`\_}
+\def\PYZob{\char`\{}
+\def\PYZcb{\char`\}}
+\def\PYZca{\char`\^}
+\def\PYZam{\char`\&}
+\def\PYZlt{\char`\<}
+\def\PYZgt{\char`\>}
+\def\PYZsh{\char`\#}
+\def\PYZpc{\char`\%}
+\def\PYZdl{\char`\$}
+\def\PYZhy{\char`\-}
+\def\PYZsq{\char`\'}
+\def\PYZdq{\char`\"}
+\def\PYZti{\char`\~}
+% for compatibility with earlier versions
+\def\PYZat{@}
+\def\PYZlb{[}
+\def\PYZrb{]}
+\makeatother
+
diff --git a/thesis2/thesis.tex b/thesis2/thesis.tex

index e58c326..794884d 100644 (file)
--- a/thesis2/thesis.tex
+++ b/thesis2/thesis.tex
@@ -5,28 +5,34 @@
  \usepackage{courier}
  \usepackage{graphicx}  % Images
  \usepackage{float}     % Better placement float figures
-\usepackage{listings}  % Source code formatting
+\usepackage{minted}
  \usepackage[dvipdfmx,hidelinks]{hyperref}  % Hyperlinks
-\usepackage{tikz}      % Sequence diagrams
-\usepackage{pgf-umlsd} %
  \usepackage{graphviz}  % For the DAG diagrams
  \usepackage{amssymb}
  \usepackage{marvosym}
-\usepgflibrary{arrows} %
  
  % Set listings settings
-\lstset{
-       basicstyle=\scriptsize,
-       breaklines=true,
-       numbers=left,
-       numberstyle=\tiny,
-       tabsize=2
-}
-\lstdefinestyle{custompy}{
-       language=python,
-       keepspaces=true,
-       columns=flexible,
-       showspaces=false
+\definecolor{mintedbackground}{rgb}{0.95,0.95,0.95}
+\newmintedfile[xmlcode]{xml}{
+       bgcolor=mintedbackground,
+       fontfamily=tt,
+       fontsize=\scriptsize,
+       frame=leftline,
+       framerule=0.4pt,
+       framesep=2mm,
+       funcnamehighlighting=true,
+       gobble=0,
+       linenos=true,
+       mathescape=false,
+       numberblanklines=true,
+       numbersep=10pt,
+       numbersep=5pt,
+       obeytabs=false,
+       showspaces=false,
+       samepage=false,
+       showtabs =false,
+       tabsize=1,
+       texcl=false,
  }
  
  \newcommand{\cvartitle}{Non IT configurable adaptive data mining solution used
author	Mart Lubbers <mart@martlubbers.net>
	Fri, 28 Nov 2014 11:50:28 +0000 (12:50 +0100)
committer	Mart Lubbers <mart@martlubbers.net>
	Fri, 28 Nov 2014 11:50:28 +0000 (12:50 +0100)
program/everything/xml/scheme.xsd		patch \| blob \| history
thesis2/1.introduction.tex		patch \| blob \| history
thesis2/2.requirementsanddesign.tex		patch \| blob \| history
thesis2/6.appendices.tex		patch \| blob \| history
thesis2/Makefile		patch \| blob \| history
thesis2/exrss.xml	[new file with mode: 0644]	patch \| blob
thesis2/scheme.xsd	[new symlink]	patch \| blob
thesis2/scheme1.xsd	[new file with mode: 0644]	patch \| blob
thesis2/scheme2.xsd	[new file with mode: 0644]	patch \| blob
thesis2/thesis.bib		patch \| blob \| history
thesis2/thesis.pyg	[new file with mode: 0644]	patch \| blob
thesis2/thesis.tex		patch \| blob \| history