This is because a lot of time and resources are spent to cross-compare, match
and check the data that enters the database. To achieve this, the data is
inserted into the database in several steps, described in
-Figure~\ref{fig:1.1.1}
+Figure~\ref{informationflow}
\begin{figure}[H]
	\caption{Information flow in the Hyperleap database}
- \label{fig:1.1.1}
+ \label{informationflow}
\centering
\scalebox{0.7}{
\digraph[]{graph111}{
rankdir=TB;
- node [shape="rectangle",fontsize=10,nodesep=0.5,ranksep=0.75,width=1]
+ node [shape="rectangle",fontsize=10,nodesep=0.7,ranksep=0.75,width=1]
edge [weight=5.]
i0 [label="Website"]
i1 [label="Email"]
\paragraph{Temporum}
The \textit{Temporum} is a big bin that contains raw data extracted from
different sources; the data has to be post-processed to make it suitable for the
-actual database. This processing encompasses several possible tasks.
-
-The first task is to check the validity of the entry. This is a very shallow
-test to check if the crawler is not malfunctioning and there is no nonsense in
-the data. Most of the data is not directly checked for validity, the data is
+actual database. This processing encompasses several possible tasks. The first
+task is to check the validity of the entry. This is a very shallow test to
+check that the crawler is not malfunctioning and that no nonsense is in the
+data. Most of the data is not directly checked for validity; the data is
skimmed for strange things, but not every datapoint is checked. The second step
is matching the entry to several objects. For example, the entry has to be
matched to a certain venue when its source is a ticket vendor who sells tickets
\caption{Feedback loop for malfunctioning crawlers}
\label{feedbackloop}
\centering
- \scalebox{0.8}{
+ \scalebox{0.5}{
\digraph[]{graph112}{
rankdir=LR;
node [shape="rectangle"]
\caption{Example Graph}
\label{graphexample}
\centering
- \digraph[]{graphexample}{
- rankdir=LR
- n1 -> n2 [dir="none"]
- n2 -> n3 [dir="none"]
- n2 -> n3 [dir="none"]
+ \scalebox{0.7}{
+ \digraph[]{graphexample}{
+ rankdir=LR
+ n1 -> n2 [dir="none"]
+ n2 -> n3 [dir="none"]
+ n2 -> n3 [dir="none"]
+ }
}
\end{figure}
tuples instead of unordered pairs. Adding this property gives the edges a
direction. Every edge has a specific start and end and is therefore called a
directed edge. A directed graph would look the same as the graph in
-Figure~\ref{graphexample} but then the normal edges would be replaced by
-directional arrows that specifically go from one node to the other.
+Figure~\ref{graphexample} but visualized with arrows instead of plain lines.
+An arrow goes specifically from one node to the other and not the other way
+around. However, bidirectional connections can occur. For example, the graph
+shown in Figure~\ref{dgexample} is directed and contains a bidirectional
+connection.
+$$G=(\{n_1, n_2\}, \{(n_1, n_2), (n_2, n_1)\})$$
+
+\begin{figure}[H]
+ \caption{Example directed graph}
+ \label{dgexample}
+ \centering
+ \scalebox{0.7}{
+ \digraph[]{dgexample}{
+ rankdir=LR
+ n1 -> n2
+ n2 -> n1
+ }
+ }
+\end{figure}
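+
+Such a directed graph maps naturally onto code. The sketch below is
+illustrative only: it stores the graph from Figure~\ref{dgexample} as a
+mapping from every node to the set of nodes its outgoing edges point to.
+\begin{minted}{python}
+# Minimal sketch of an adjacency-set representation; illustrative only.
+# Every key is a node, every value is the set of direct successors.
+graph = {
+    'n1': {'n2'},  # the edge (n1, n2)
+    'n2': {'n1'},  # the edge (n2, n1): together a bidirectional connection
+}
+
+def has_edge(g, source, target):
+    """Check whether the directed edge (source, target) is present."""
+    return target in g.get(source, set())
+\end{minted}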
\paragraph{Directed acyclic graphs}
Directed Acyclic Graphs (DAGs) are a special kind of directed graph. DAGs
are also defined as $G=(V,E)$, but with a restriction on $E$, namely that cycles
-are not allowed. Figure~\ref{dagexample} shows two graphs. The left graph
-contains a cycle and the right graph does not. Only the right graph is a valid
+are not allowed. Figure~\ref{dagexample} shows two graphs. The bottom graph
+contains a cycle and the top graph does not. Only the top graph is a valid
DAG. A cycle is defined by a sequence of edges through which nodes are visited
more than once. Adding the property of acyclicity to graphs lowers the
computational complexity of checking if a node sequence is present in the
\caption{Example DAG}
\label{dagexample}
\centering
- \digraph[]{dagexample}{
- rankdir=LR
- n01 -> n02
- n02 -> n03
- n03 -> n01
- n11 -> n12
- n12 -> n13
- n12 -> n14
+ \scalebox{0.7}{
+ \digraph[]{dagexample}{
+ rankdir=LR
+ n01 -> n02
+ n02 -> n03
+ n03 -> n01
+ n11 -> n12
+ n12 -> n13
+ n12 -> n14
+ }
}
\end{figure}
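+
+Checking for cycles can be sketched with a depth-first search as below; this
+is an illustration of the idea, not code from the program.
+\begin{minted}{python}
+# Minimal sketch of cycle detection via depth-first search; illustrative only.
+def has_cycle(graph):
+    """Return True iff the directed graph (adjacency sets) has a cycle."""
+    done, path = set(), set()
+
+    def visit(node):
+        if node in path:   # node seen again on the current path: a cycle
+            return True
+        if node in done:   # node fully explored before: no cycle through it
+            return False
+        path.add(node)
+        if any(visit(successor) for successor in graph.get(node, set())):
+            return True
+        path.remove(node)
+        done.add(node)
+        return False
+
+    return any(visit(node) for node in graph)
+\end{minted}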
\caption{Example DAWG}
\label{exampledawg}
\centering
- \digraph[]{graph21}{
- rankdir=LR;
- n1,n2,n3,n4,n5 [shape="circle"];
- n6 [shape="doublecircle"];
- n1 -> n2 [label="a"];
- n2 -> n3 [label="b"];
- n3 -> n6 [label="d"];
- n1 -> n4 [label="b"];
- n4 -> n5 [label="a"];
- n5 -> n6 [label="d"];
- n5 -> n6 [label="e"];
+ \scalebox{0.7}{
+ \digraph[]{graph21}{
+ rankdir=LR;
+			node [shape="circle"];
+ n6 [shape="doublecircle"];
+ n1 -> n2 [label="a"];
+ n2 -> n3 [label="b"];
+ n3 -> n6 [label="d"];
+ n1 -> n4 [label="b"];
+ n4 -> n5 [label="a"];
+ n5 -> n6 [label="d"];
+ n5 -> n6 [label="e"];
+ }
}
\end{figure}
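+
+The DAWG in Figure~\ref{exampledawg} accepts exactly the words abd, bad and
+bae. Looking a word up means following the labelled edges; the sketch below
+mirrors the figure and is illustrative only.
+\begin{minted}{python}
+# Minimal sketch of word lookup in the DAWG of the figure; illustrative only.
+# Transitions map (state, character) to the next state; n6 is accepting.
+transitions = {('n1', 'a'): 'n2', ('n2', 'b'): 'n3', ('n3', 'd'): 'n6',
+               ('n1', 'b'): 'n4', ('n4', 'a'): 'n5', ('n5', 'd'): 'n6',
+               ('n5', 'e'): 'n6'}
+
+def accepts(word):
+    """Follow the labelled edges; accept iff the word ends in n6."""
+    state = 'n1'
+    for character in word:
+        state = transitions.get((state, character))
+        if state is None:  # no outgoing edge with this label
+            return False
+    return state == 'n6'
+\end{minted}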
\subsubsection{Edit/Remove crawler}
This component lets the user view the crawlers and remove the crawlers from the
-database. Removing the crawler is as simple as selecting it from the dropdown
-list and pressing the remove button. Editing the crawler is done in the same
-fashion but then pressing the edit button. The editing of the crawlers is
-basically the same as adding a new crawler other then that the previous pattern
-is already visible and can be adapted if for example the structure has changed.
+database. Performing either operation is as simple as selecting the crawler
+from the dropdown menu, selecting the operation from the second dropdown menu
+and pressing \textit{Submit}.
+Removing a crawler deletes it completely from the crawler database; a removed
+crawler is unrecoverable. Editing a crawler opens a screen similar to the one
+for adding a crawler, described in Section~\ref{addcrawler}. The only
+difference is that the previously trained patterns are already visible in the
+training interface, so they can be adapted, for example when the structure of
+the source has changed.
\subsubsection{Add new crawler}
+\label{addcrawler}
+TODO
\subsubsection{Test crawler}
\subsection{Backend}
\subsubsection{Program description}
The backend consists of a main module and a set of libraries all written in
-\textit{Python}\cite{Python}. The main module can,
-and is, be embedded in an apache
-webserver\footnote{\url{https://httpd.apache.org/}} via the
-\textit{mod\_python} apache module\cite{Modpython}. The module
-\textit{mod\_python} allows the webserver to execute Python code in
-the webserver. We chose Python because of the rich set of standard libraries
-and solid cross platform capabilities. We chose Python 2 because it is still
-the default Python version on all major operating systems and stays supported
-until at least the year 2020 meaning that the program can function safe at
-least 5 full years. The application consists of a main Python module that is
-embedded in the webserver. Finally there are some libraries and there is a
-standalone program that does the periodic crawling.
+\textit{Python}\cite{Python}. The main module can be, and is, embedded in an
+Apache webserver\cite{apache} via the \textit{mod\_python} Apache
+module\cite{Modpython}. \textit{mod\_python} allows the webserver to execute
+Python code directly. We chose Python because of its rich set of standard
+libraries and solid cross-platform capabilities. We chose Python 2 because it
+is still the default Python version on all major operating systems and will
+remain supported until at least the year 2020, meaning that the program can
+function safely for at least 5 full years. Besides the main module there are
+some libraries and a standalone program that does the periodic crawling.
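+
+As an illustration, a minimal \textit{mod\_python} handler looks like the
+sketch below. This is not the actual main module; the handler body is
+illustrative only.
+\begin{minted}{python}
+# Minimal sketch of a mod_python handler; illustrative only.
+# Apache calls handler() for every request routed to this module.
+from mod_python import apache
+
+def handler(req):
+    req.content_type = 'text/plain'  # set the MIME type of the response
+    req.write('main module alive')   # write the response body
+    return apache.OK                 # report success back to Apache
+\end{minted}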
\subsubsection{Main module}
The main module is the program that deals with the requests, controls the
The crawler also has a function to export the database to XML format. The XML
format is specified in an XSD\cite{Xsd} file for minimal ambiguity.
-An XSD file is a file that precisely describes the field the XML file uses.
-As almost every programming language contains an XML library most of the
-languages also contain a XSD library that allows the XML library to parse files
-according to the scheme and in that way the programmer knows exactly what to
-expect as XML fields. The XSD file for this program can be found in the
-appendices in Listing~\ref{scheme.xsd}.
+\subsubsection{XML \& XSD}
+XML is a file format that can describe data structures. XML can be accompanied
+by an XSD file that describes the format. An XSD file is in fact just another
+XML file that describes the format of a class of XML files. Because almost all
+programming languages have an XML parser built in, XML is a very versatile
+format that makes importing into the database easy. The most used languages
+also include XSD validation to check the validity and completeness of XML
+files. This makes interfacing with the database and possible future programs
+easy. The XSD scheme used for this program's output can be found in the
+appendices in Listing~\ref{scheme.xsd}. The XML output can be queried via an
+HTTP interface that calls the crawler backend to crunch the latest crawled
+data into XML.
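+
+Validating the XML output against the XSD scheme can, for example, be done
+with the \textit{lxml} library as sketched below; the file names are
+illustrative only.
+\begin{minted}{python}
+# Minimal sketch of XSD validation with lxml; file names are illustrative.
+from lxml import etree
+
+schema = etree.XMLSchema(etree.parse('scheme.xsd'))  # load the XSD scheme
+document = etree.parse('output.xml')                 # load the crawler output
+if schema.validate(document):                        # check conformance
+    print('output.xml conforms to scheme.xsd')
+else:
+    print(schema.error_log)                          # list every violation
+\end{minted}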
-\section{Conclusion \& Discussion}
+\section{Conclusion}
+
+\section{Discussion}
\label{sec:discuss}
\begin{itemize}
\item No low level stuff, future research
\item RSS not that great of a source,
\item Expand technique to HTML, reuse interface, defining patterns
\item Combine RSS and HTML
- \item
\end{itemize}
SHELL:=/bin/bash
-VERSION:=0.3
+VERSION:=0.4
all: thesis
dvipdfm thesis.dvi >> log.txt 2>&1
mv -v {thesis,mart_thesis_$(VERSION)}.pdf
+pack: clean
+ rm -fv version/mart_thesis_$(VERSION).tar{,.gz}
+ tar -cvf version/mart_thesis_$(VERSION).tar *.{tex,xml,png,bib,xsd}
+ gzip -9 version/mart_thesis_$(VERSION).tar
+
+
clean:
- rm -vf *.{aux,bbl,blg,dvi,log,out,pdf,toc,dot,ps} log.txt scheme[12].xsd
+ rm -vf *.{aux,bbl,blg,dvi,log,out,pdf,toc,dot,ps,pyg} log.txt scheme[12].xsd
</item>
<item>
<title>zaterdag 21 maart 2015 20:00 - ...
+
+
+ ...
http://www.w3.org/TR/1998/REC-xml-19980210},
year={1998}
}
-@article{adler2001extensible,
+@article{Xsd,
title={Extensible Stylesheet Language (XSL)-Version 1.0},
author={Adler, Sharon and Milowski, Alex and Richman, Jeremy and Zilles,
Steve and others},
year={2001},
publisher={Citeseer}
}
+@article{apache,
+ title={The Apache HTTP server project},
+ author={Fielding, Roy T and Kaiser, Gail},
+ journal={Internet Computing, IEEE},
+ volume={1},
+ number={4},
+ pages={88--90},
+ year={1997},
+ publisher={IEEE}
+}
\chapter{Algorithm}
\input{3.methods.tex}
-\section{Discussion}
+\chapter{Conclusion \& Discussion}
\input{4.discussion.tex}
\chapter{Appendices}