X-Git-Url: https://projects.mako.cc/source/state_of_wikimedia_research_2015/blobdiff_plain/10463fabeab4400b02b1cfa7bf06aea96214e621..7cd2a85d1c8bc2d5189e0ad63b44756dc3b24b3d:/20150717-wikimania_research.tex?ds=sidebyside diff --git a/20150717-wikimania_research.tex b/20150717-wikimania_research.tex index 854624e..8150bad 100644 --- a/20150717-wikimania_research.tex +++ b/20150717-wikimania_research.tex @@ -337,12 +337,55 @@ \end{frame} +\begin{frame} + \frametitle{How to measure the global influence of languages?} + + \larger \larger + + \e{Traditional} methods rely on: + + \begin{itemize} + \larger \larger + \item \e{Population} of speakers + \item \e{Income} or political power of speakers + \end{itemize} + + Paper presents \e{new network method} based on measuring + \e{co-speakers} of languages in several data sources including + Wikipedia. + +\end{frame} + \begin{frame} \frametitle{Wikipedia as a source of data: Ronen et al.} \includegraphics[width=\textwidth]{figures/ronen_fig1.png} + + \note{Two languages are connected when users that edit an article in + one Wikipedia language edition are significantly more likely to + also edit an article in the edition of the other language. + + If an editor of Spanish is also likely to edit Galician, we'll + call those languages connected.} +\end{frame} + +\begin{frame} + \frametitle{Wikipedia as a source of data: Ronen et al.} + + \includegraphics[width=\textwidth]{figures/ronen_people.png} + + \note{\begin{itemize} + \item The number of people per language (born 1800–1950) with + articles in at least 26 Wikipedia language editions as a + function of their language’s eigenvector centrality. + \item The bottom row shows the number of people per language (born + 1800–1950) listed in \emph{Human Accomplishment} (a book by + Charles Murray) as a function of their language’s eigenvector + centrality. 
+ \end{itemize}} \end{frame} + \subsection{Community and Organization} \begin{frame} @@ -366,13 +409,29 @@ \begin{frame} \frametitle{Community and organization: Warncke-Wang et al.} + + \larger \larger + \e{Perfect Alignment Hypothesis (PAH)}: There is an exact match + between the supply of high-quality content and the demand for it. + + \bigskip \includegraphics[width=\textwidth]{figures/warncke-english_confusion.pdf} + + \note{\e{Quality}: Stub, Start, C, B, Good Article, A, Featured Article + + \e{Popularity}: equivalently sized buckets} \end{frame} \begin{frame} \frametitle{Community and organization: Warncke-Wang et al.} + Measure of the degree of misalignment can be used to build lists of + categories that are relatively \e{``overproduced''} and + \e{``underproduced''}: + + \bigskip + \includegraphics[width=\textwidth]{figures/warncke-english_overunder.pdf} \end{frame} @@ -489,20 +548,97 @@ -% \begin{frame} -% \frametitle{Wikipedia Viewership and Flu Prediction: Results} +\begin{frame} + \centertext{6em}{Automation in Wikipedia} + + \note{Tilman + + Starting to see more practical applications of AI methods to editing. + + Bots have been writing Wikipedia articles ever since back in 2002, + User:Rambot covered US municipalities from US census data. + + Picked these two related papers for their somewhat unusual approach} +\end{frame} + + +\begin{frame} + \frametitle{Automation in Wikipedia} + + \larger \larger + Banerjee et al., \e{Playscript Classification and Automatic Wikipedia + Play Articles Generation}. + 2014 22nd International Conference on Pattern Recognition (ICPR). + pp. 3630–3635. 
+ \href{http://dx.doi.org/10.1109/ICPR.2014.624} + {DOI:10.1109/ICPR.2014.624} + \href{http://www.cse.unt.edu/~ccaragea/papers/icpr14.pdf}{Author's copy} + +\end{frame} + + +\begin{frame} + +\frametitle{Automation in Wikipedia: Bot-written theatre play articles} + + \begin{itemize} + \larger \larger \larger + \item Bot searches for playscripts and related documents on the web + \bigskip + \item Extract key information from them, e.g. + \begin{itemize} \larger + \item The play's main characters + \item Relevant sentences from online synopses of the play + \item Mentions in Google Books and Google News (as evidence that + the play satisfies Wikipedia's notability criteria) + + \end{itemize} + + \item Some heuristics to exclude non-encyclopedic sentences, e.g. + first-person statements + + \end{itemize} + + \note{Tilman + + NB: Most article creation bots work from well-defined databases + (e.g. species, census data, geographical databases). + + This bot finds article topics and online references itself, + using an elaborate classifier algorithm to distinguish scripts + from non-scripts.} +\end{frame} + +\begin{frame} +\frametitle{Automation in Wikipedia: Bot-written theatre play articles} -% \centering -% \includegraphics[width=\textwidth]{figures/flu.png} + \includegraphics[width=0.3\textwidth]{figures/Fourteen_submission.png} + \begin{itemize} + \larger \larger \larger + \item 15 articles submitted at Articles for Creation. Two accepted + by Wikipedia editors. One of them without major changes. + + + \end{itemize} -% \note{\begin{itemize} -% \larger \larger + \note{Tilman + + Editors were unaware the articles had been automatically generated. + + + Related paper by some of the same authors: + + Banerjee et al., \e{WikiKreator: Improving Wikipedia Stubs + Automatically}. 
\href{https://siddbanpsu.github.io/publications/acl2015-banerjee-preprint.pdf} + {preprint}, accepted paper at ACL2015 + + Elaborate classifier method to find suitable web resources for + expanding stubs - but copying sentences wholesale from these into + articles landed the bot (User:MightyPepper) in a \href{https://en.wikipedia.org/wiki/Wikipedia:Contributor_copyright_investigations/Archive#2015}{contributor copyright investigation}\ldots + } +\end{frame} -% \item Wikipedia better than Google at predicting peak flu weeks. -% \item Wikipedia better at predicting relative influenza rates. -% \end{itemize}} -% \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @@ -578,7 +714,7 @@ AAAI Conference on Web and Social Media (ICWSM). \item 2: Articles about women tend to be less centrally connected in the network of articles than articles about men (Smurfette!) \item 3: (\e{viz}) Content of articles about women uses different words - than those about men. Much igher incidence of language related to + than those about men. Much higher incidence of language related to family, gender, and relationships. \end{itemize} } @@ -728,3 +864,12 @@ AAAI Conference on Web and Social Media (ICWSM). \end{document} + +% LocalWords: xshift yshift makopurple Tilman wikipedia Scopus Hu +% LocalWords: Ronen Gonçalves Vespignani Hidalgo al Galician Ranjan +% LocalWords: eigenvector Warncke Terveen Hecht underproduced NEJM +% LocalWords: Hwang Engl doi Kräenbring WebMD WikiProject Mohsen +% LocalWords: Jadidi Markus Strohmaier Wikipedias WPs Smurfette +% LocalWords: Barnhisel Rapchak WikiEd Mesgari Mostafa Okoli Chitu +% LocalWords: Mehdi Mohamad Årup Lanamäki Arto Miquel Ribé OpenSym +% LocalWords: WikiPapers