From 0e088fc9b0523193a6a2736e9d31f2ced0850269 Mon Sep 17 00:00:00 2001 From: Mart Lubbers Date: Mon, 11 May 2015 13:36:16 +0200 Subject: [PATCH] fixed up to but not including 1.3 --- thesis2/.gitignore | 1 + thesis2/1.introduction.tex | 212 +++++++++++++++++--------------- thesis2/Makefile | 2 +- thesis2/abstract.tex | 27 ++-- thesis2/img/informationflow.dot | 54 ++++---- 5 files changed, 159 insertions(+), 137 deletions(-) diff --git a/thesis2/.gitignore b/thesis2/.gitignore index 4b8e74e..65a3636 100644 --- a/thesis2/.gitignore +++ b/thesis2/.gitignore @@ -10,6 +10,7 @@ *.ps *.eps *.pyg +*.fmt scheme[12].xsd log.txt _minted* diff --git a/thesis2/1.introduction.tex b/thesis2/1.introduction.tex index f87a63d..b15167b 100644 --- a/thesis2/1.introduction.tex +++ b/thesis2/1.introduction.tex @@ -2,61 +2,63 @@ What do people do when they want to grab a movie? Attend a concert? Find out which theater shows play in local theater? -When the internet was in its early days and it started to be accessible to most -of the people information about entertainment was still obtained almost -exclusively from flyers, books, posters, radio/TV advertisements. People had to -look hard for information and you could easily miss a show just because you did -not cross paths with it. -Today the internet is used by almost everyone in the western society on a daily -basis and we would think that missing an event would be impossible because of -the enormous loads of information you can receive every day using the internet. -The opposite is true for information about leisure activities. +In the early days of the internet, access to the web was not available for most +of the people. Information about leisure activities was almost exclusively +obtained from flyers, posters and other written media and radio/TV +advertisements. People had to put effort in searching for information and it +was easy to miss a show just because you did not cross paths with it. 
Today the +internet is used on a daily basis by almost everyone in the western society and +one would think that missing an event would be impossible because of the +enormous loads of information you can receive every day using the internet. +For leisure activities the opposite is true, complete and reliable information +about events is still hard to find. Nowadays information on the internet about entertainment is offered via two main channels: individual venues websites and information bundling websites. Individual venues put a lot of effort and resources in building a beautiful, fast and most of all modern website that bundles their information with nice -graphics, animations and gimmicks. There also exist companies that bundle the -information from different websites to provide an overview. Information -bundling websites often have multiple individual venue websites as the source -for their information and therefore the information is most of the time not -complete. This is because individual venues tend to think, for example, that it -is obvious what the address of their venue is, that their ticket price is -always fixed to \EURdig$5.-$ and that you need a membership to attend the -events. Individual organizations usually put this non specific information in a -disclaimer or a separate page and information bundling website miss out on -these types of information a lot. They miss out because they can crawl these -individual events but gathering the miscellaneous information is usually done -by hand. - -Combining the information from the different data source turns out to be a hard -task for such information bundling websites. It is a hard task because -information bundling websites do not have the resources and time reserved for -these tasks and therefore often serve incomplete information. Because of -the complexity of getting complete information there are not many websites -trying to bundle entertainment information into a complete and consistent -database. 
Hyperleap\footnote{\url{http://hyperleap.nl}} tries to achieve goal -of serving complete and consistent information. +graphics, animations and other gimmicks. Information bundling websites are run +by companies that try to provide an overview of multiple venues. Information +bundling websites often have individual venue websites as their source for +information. Individual venues assume, for example, that it is obvious what the +address of their venue is, that their ticket price is always fixed to +\EURdig$5.-$ and that you need a membership to attend the events. Individual +organizations usually put this non-specific information in a disclaimer or a +separate page. Because of the less structured way of providing information the +information bundling websites have a hard time finding complete information. +The event data can be crawled using automated crawlers but the miscellaneous +information usually has to be crawled by hand. +Combining the information from the different data sources turns out to be a +complicated task for information bundling websites. The task is difficult +because the companies behind these information bundling websites do not have +the resources and time reserved for these tasks and therefore often serve +incomplete information. Because of the complexity of getting complete +information there are not many companies trying to bundle entertainment +information into a complete and consistent database and website. +Hyperleap\footnote{\url{http://hyperleap.nl}} tries to achieve the goal of serving +complete and consistent information and offers it via various information +bundling websites. +\newpage \section{Hyperleap \& Infotainment} -Hyperleap is an internet company that existed in the time that internet was not -widespread. Hyperleap, active since 1995, is specialized in producing, +Hyperleap is an internet company that was founded in a time when the internet was +not widespread. 
Hyperleap, active since 1995, is specialized in producing, publishing and maintaining \textit{infotainment}. \textit{Infotainment} is a combination of the words \textit{information} and \textit{entertainment}. It -represents a combination of factual information and subjectual information -(entertainment) within a certain category or field. In the case of Hyperleap -the category is the leisure industry, leisure industry encompasses all facets -of entertainment going from cinemas, theaters, concerts to swimming pools, -bridge competitions and conferences. Within the entertainment industry factual -information includes, but is not limited to, information such as starting time, -location, host or venue and duration. Subjectual information includes, but is -not limited to, information such as reviews, previews, photos, background -information and trivia. - -Hyperleap manages the largest database containing \textit{infotainment} about -the leisure industry focussed on the Netherlands and surrounding regions. The -database contains over $10.000$ categorized events on average per week and +represents a combination of factual information, the \textit{information} part, +and subjectual information, the \textit{entertainment} part, within a certain +category or field. In the case of Hyperleap the category is the leisure +industry; the leisure industry encompasses all facets of entertainment, ranging from +cinemas, theaters and concerts to swimming pools, bridge competitions and +conferences. Within the entertainment industry factual information includes, +but is not limited to, starting time, location, host or venue and duration. +Subjectual information includes, but is not limited to, reviews, previews, +photos, background information and trivia. + +Hyperleap claims to manage the largest database containing \textit{infotainment} +about the leisure industry focussed on the Netherlands and surrounding regions. 
+The database contains over $10.000$ categorized events on average per week and their venue database contains over $54.000$ venues delivering the leisure activities ranging from theaters and music venues to petting zoos and fast food restaurants. All the subjectual information is obtained or created by Hyperleap @@ -64,13 +66,13 @@ and all factual information is gathered from different sources, quality checked and therefore very reliable. Hyperleap is the only company in its kind that has such high quality information. The \textit{infotainment} is presented via several websites specialized per genre or category and some sites attract over -$500.000$ visitors per month. +$500.000$ visitors each month. \section{Information flow} -The reason why Hyperleap is the only in its kind with the high quality data is -because Hyperleap spends a lot of time and resources on quality checking, cross -comparing and consistency checking before the data enters the database. To -achieve this, the data will go through several different stages before it +Hyperleap can keep up the high quality data by investing a lot of time and +resources in quality checking, cross comparing and consistency checking. By +doing so the chances of incomplete or wrong data are much lower. +To achieve this, the data will go through several different stages before it enters the database. These stages are visualized in Figure~\ref{informationflow} as an information flow diagram. In this diagram the nodes are processing steps and the arrows denote information transfer or @@ -79,83 +81,89 @@ flow. \begin{figure}[H] \label{informationflow} \centering - \includegraphics[scale=0.7]{informationflow.pdf} - \strut\\ - \strut\\ + \includegraphics[width=\linewidth]{informationflow.pdf} \caption{Information flow Hyperleap database} \end{figure} -\newpage -\subsection*{Sources} -The information that enters the database has to be quality checked and this -checking starts at the source of the data. 
There are several criteria the -information from the source have to comply to before it can enter the -database. The prerequisites for a source are for example the fact that the -source has to be reliable, consistent and free by licence. Event information -from a source must have at least the following fields of information.\\ -\textbf{What:}\\ -The \textit{What} field is the field that describes the content, content is a -very broad definition. In practice it can be describing the concert tour name, -theater show title, movie title, festival title and many more.\\ -\textbf{Where:}\\ -The \textit{Where} field is the location of the event. The location is often -omitted because the organization behind source presenting the information think -it is obvious. This field can also include different sub locations. For -example when a pop concert venue has their own building but in the summer they -organize a festival in some park. This data is often assumed to be trivial and -inherent but in practice this is not the case. In this example for an outsider -only the name of the park is often not enough.\\ -\textbf{When:}\\ -The \textit{When} field is the time and date of the event. Hyperleap wants -to have at minimum the date, start time and end time. In the field end -times are often omitted because they are not fixed or the organization -think it is obvious.\\ +\subsection{Sources} +A source is a service, location or medium in which information about events is +stored or published. A source can have different source shapes such as HTML, +email, flyer, RSS and so on. All information gathered from a source has to be +quality checked before it is even considered for automated crawling. There are +several criteria information from the source has to comply to before an +automated crawler can be made. The prerequisites for a source are for example +the fact that the source has to be reliable, consistent and free by licence. 
+Event information from a source must have at least the \textit{What, Where} and +\textit{When} information. + +The \textit{What} information is the information that describes the content. +Content is a very broad notion, but in practice it can describe the +concert tour name, theater show title, movie title, festival title and many +more. + +The \textit{Where} information is the location of the event. The location is +often omitted because the organization behind the source presenting the information +thinks it is obvious. This information can also include different sub +locations. For example, a pop concert venue may have its own building but in +the summer organize a festival in some park. This data is often assumed to +be trivial and inherent but in practice this is not the case. In this example +for an outsider only the name of the park is often not enough. + +The \textit{When} information is the time and date of the event. Hyperleap wants to +have at minimum the date, start time and end time. In the field, end times for +example are often omitted because they are not fixed or the organization thinks +it is obvious. \strut\\ + Venues often present incomplete data entries that do not comply to the requirements explained before. Within the information flow categorizing and grading the source is the first step. Hyperleap processes different sources and source types and every source has different characteristics. Sources can be modern sources like websites or social media but even today a lot of -information arrives at Hyperleap via flyers, fax or email. As sources vary in -content structure sources also vary in reliability. For example the following -entry is an example of a very well structured and probably generated, and thus -probably also reliable, event description. The entry can originate for example -from the title of an entry in a RSS feed. The example has a clear structure and -almost all information required is available directly from the entry. 
+information arrives at Hyperleap via flyers, fax or email. As source types vary +in content structure sources also vary in reliability. For example the +following entry is an example of a very well structured and probably generated, +and thus probably also reliable, event description. The entry can originate for +example from the title of an entry in a RSS feed. The example has a clear +structure and almost all information required is available directly from the +entry. \begin{flushleft} \texttt{2015-05-20, 18:00-23:00 - \textit{Foobar} presenting their % new CD in combination with a show. Location: small salon.} \end{flushleft} -An example of a terrible item could be for example the following text that +An example of a low quality item could be for example the following text that could originate from a flyer or social media post. This example lacks a precise date, time and location and is therefore hard for people to grasp at first, let alone for a machine. When someone wants to get the full information he has to tap in different resources which might not always be available. \begin{flushleft} - \texttt{\textit{Foobar} playing to celebrate their CD release in the% + \texttt{\textit{Foobar} playing to celebrate their CD release in the % park tomorrow evening.} \end{flushleft} -\subsection*{Crawler} +Information with such a low quality is often not suitable for automated +crawling. In Figure~\ref{informationflow} this manual route is shown by the +arrow going straight from the source to the database. Non digital source types +or very sudden changes such as surprise concerts or cancellations are also +manually crawled. + +\subsection{Crawler} When the source has been determined and classified the next step is -periodically crawling the source. At the moment the crawling happens using two -main methods.\\ -\textbf{Manual crawling:}\\ -Manual crawling is basically letting an employee access the source and put the -information directly in the database. 
This often happens with non digital -sources and with very sudden events or event changes such as surprise concerts -or event cancellation.\\ -\textbf{Automatic crawling:}\\ -Some sites are very structured and a programmer can create a program that can -visit the website systematically and automatically to extract all the new -information. Not all digital sources are suitable to be crawled automatically -and will still need manual crawling. The programmed crawlers are always -specifically created for one or a couple sources and when the source changes -for example structure the programmer has to adapt the crawler which is costly. -Information from the all the crawlers goes first to the \textit{Temporum}. +periodically crawling the source using an automated crawler. As said before, +sources have to be structured and reliable. When this is the case, a programmer +will create a program that will visit the website systematically and +automatically to extract all the new information. The programmed crawlers are +usually specifically created for one or more sources and when the source +changes, the programmer has to adapt the crawler. Such a change is usually a +change in structure. Since writing and adapting a crawler requires a programmer, the +process is costly. Automatically crawled information is not inserted into the +database directly because the information is not reliable enough. In case of a +change in the source malformed data can pass through. As a safety net and final +check the information first goes to the \textit{Temporum} before it will be +entered in the database. 
\subsection*{Temporum} The \textit{Temporum} is a big bin that contains raw data extracted from diff --git a/thesis2/Makefile b/thesis2/Makefile index ba6cf1a..34a32f5 100644 --- a/thesis2/Makefile +++ b/thesis2/Makefile @@ -10,7 +10,7 @@ GRAPHS:=$(addsuffix .pdf,$(basename $(shell ls img/*.{dot,png}))) .SECONDARY: $(addsuffix .fmt,$(basename $(OUTPUT))) .PHONY: clobber graphs -all: graphs thesis.pdf +all: thesis.pdf graphs %.pdf: %.png convert $< $@ diff --git a/thesis2/abstract.tex b/thesis2/abstract.tex index 05f5f49..2cc2d04 100644 --- a/thesis2/abstract.tex +++ b/thesis2/abstract.tex @@ -1,12 +1,15 @@ -Within the leisure activity field, information is often bundled badly and -contains empty or wrong data. Hyperleap tries to solve this problem by bundling -the information from various sources including RSS feeds. Currently the -feedback loop for fixing site-specific crawlers requires multiple steps of which -multiple steps demand someone with a computer science background to perform. We -introduce a new adaptable crawler generation system using substring matching via -an adapted form of directed acyclic word graphs. The application allows users -with no particular computer science background to create, edit and test -crawlers for RSS feeds. In this way the feedback loop for broken crawlers is -shortened, new sources can be incorporated in the database quicker and, most -importantly, the information about the latest movie show, theater production or -conference will reach the people looking for it as fast as possible. +When looking for an activity in a bar or trying to find a good movie to watch +it often seems difficult to find complete information about the event without +empty or wrong data. Hyperleap tries to solve the problem of poor information +provision by bundling the information from various sources and investing in good +quality checking. 
Currently information retrieval is performed using +site-specific crawlers. When a crawler breaks, the feedback loop for fixing +it consists of several steps and requires someone with a computer science +background. A crawler generation system has been created that uses directed +acyclic word graphs to assist in solving the feedback loop problem. The system +allows users with no particular computer science background to create, edit and +test crawlers for \textit{RSS} feeds. In this way the feedback loop for broken +crawlers is shortened, new sources can be incorporated in the database quicker +and, most importantly, the information about the latest movie show, theater +production or conference will reach the people looking for it as fast as +possible. diff --git a/thesis2/img/informationflow.dot b/thesis2/img/informationflow.dot index ef3cccd..9755125 100644 --- a/thesis2/img/informationflow.dot +++ b/thesis2/img/informationflow.dot @@ -1,25 +1,35 @@ digraph{ - rankdir=TB; + //rankdir=TB; + rankdir=LR; + graph [compound=true]; node [shape="rectangle",fontsize=10,nodesep=0.7,ranksep=0.75,width=1]; - edge [weight=5.]; - i0 [label="Website"]; - i1 [label="Email"]; - i2 [label="Fax"]; - i3 [label="RSS/Atom"]; - p1 [label="Preproccessing"]; - p2 [label="Temporum: Postproccesing"]; - o1 [label="Database: Insertion"]; - o2 [label="TheAgenda"]; - o3 [label="BiosAgenda"]; - o4 [label="..."]; - node [width=5]; p1 p2 o1; - i0 -> p1; - i1 -> p1; - i2 -> p1; - i3 -> p1; - p1 -> p2; - p2 -> o1; - o1 -> o2; - o1 -> o3; - o1 -> o4; + edge [weight=10]; + + subgraph cluster_0 { + node [shape="rectangle",fontsize=10,nodesep=0.7,ranksep=0.75,width=1]; + edge [weight=5.]; + i0 [label="Website"]; + i1 [label="Email"]; + i2 [label="RSS/Atom"]; + i4 [label="..."]; + label="Sources"; + } + + c1 [label="Crawling"]; + t2 [label="Temporum"]; + d1 [label="Database"]; + + subgraph cluster_1 { + node [shape="rectangle",fontsize=10,nodesep=0.7,ranksep=0.75,width=1]; + edge [weight=5.]; + o1 
[label="..."]; + o2 [label="BiosAgenda"]; + o3 [label="TheAgenda"]; + label="Publication"; + } + i2 -> c1 [ltail=cluster_0]; + i0 -> d1 [ltail=cluster_0]; + c1 -> t2; + t2 -> d1; + d1 -> o2 [lhead=cluster_1]; } -- 2.20.1