PROJECTS = [
    {projectName: 'Intro',
     data: {
       title: 'Introduction',
       people: [],
       image: '',
       links: {
         data: [],
         paper: [],
         demo: [],
         title: [],
         other: []},
       narrative:'The CLAIR (Computational Linguistics And Information Retrieval) research group focuses on text analysis, natural language processing, information retrieval, and network analysis. Specific projects involve text summarization, question answering, topic modeling, and bibliometrics. The applications areas include bioinformatics, political science, social media analysis, and others.',
      }
    },
    {projectName: 'Clairlib',
     data: {
       title: 'clairlib: the meta-project',
       people: [
         'Dragomir Radev',
         'Mark Hodges',
         'Anthony Fader',
         'Mark Joseph',
         'Joshua Gerrish',
         'Mark Schaller',
         'Jonathan dePetri',
         'Bryan Gibson',
         'Chen Huang'],
       image: 'research_files/clairlib-330.png',
       links: {
         data: [
           {'text': 'Project Website',
            'href': 'http://www.clairlib.org'},
           {'text': 'Alternate Project Website',
            'href': 'http://belobog.si.umich.edu/clair/clairlib'}
         ],
         paper: [],
         demo: [],
         title: [],
         other: []},
       narrative:'Clairlib is a suite of open-source Perl modules developed and maintained by the Computational Linguistics And Information Retrieval (CLAIR) group at the University of Michigan. Clairlib is intended to simplify a number of generic tasks in natural language processing (NLP), information retrieval (IR), and network analysis (NA). The latest version of clairlib is 1.06 which was released on March 2009 and includes about 130 modules implementing a wide range of functionalities. <br /><br /> Clairlib is distributed in two forms: Clairlib-core, which has essential functionality and minimal dependence on external software, and Clairlib-ext, which has extended functionality that may be of interest to a smaller audience. Much can be done using Clairlib on its own. Some of the things that Clairlib can do are: Tokenization, Summarization, Document Clustering, Document Indexing, Web Graph Analysis, Network Generation,  Power Law Distribution Analysis, Network Analysis, RandomWalks on Graphs, Tf-IDF, Perceptron Learning and Classification,  and Phrase Based Retrieval and Fuzzy OR Queries. <br /><br />Clairlib modules are available for download on www.clairlib.org . Installation instructions and modules documentation is also available in both PDF and HTML formats. Clairlib comes with a lot of code examples and a set of useful tutorials on using its modules in various applications.<br /><br />This work has been supported in part by National Institutes of Health grants R01 LM008106 "Representing and Acquiring Knowledge of Genome Regulation" and U54 DA021519 "National center for integrative bioinformatics", as well as by grants IDM 0329043 "Probabilistic and link-based Methods for Exploiting Very Large Textual Repositories", DHB 0527513 "The Dynamics of Political Representation and Political Rhetoric", 0534323 "Collaborative Research: BlogoCenter - Infrastructure for Collecting,Mining and Accessing Blogs", and 0527513 "The Dynamics of Political Representation and Political Rhetoric", from the National Science Foundation.'
      }
    },
    {projectName: 'PoliText',
      data: {
        title: 'Analyzing Political Speech',
        people: [],
        image: 'research_files/poli_sci-330.png',
        links: {
          data: [],
          paper: [],
          demo: [],
          title: [],
          other: [
            {'text': 'Topic Identification',
             'href': 'http://belobog.si.umich.edu/clair/clair/poliscitopics.html'},
            {'text': 'Identifying Central Speakers',
             'href': 'http://belobog.si.umich.edu/clair/clair/poliscispeakers.html'}
          ]
        },
        narrative:'We introduce a technique for identifying the most salient participants in a discussion. Our method, MavenRank, is based on lexical centrality: a random walk is performed on a graph in which each node is a participant in the discussion and an edge links two participants who use similar rhetoric. As a test, we used MavenRank to identify the most influential members of the US Senate using data from the US Congressional Record and used committee ranking to evaluate the output. Our results show that MavenRank scores are largely driven by committee status in most topics, but can capture speaker centrality in topics where speeches are used to indicate ideological position instead of influence legislation. We are currently working on a dynamic extension of MavenRank that identifies influential speakers at a given time. '
      }
    },
    {projectName: 'DynamicSalience',
      data: {
        title: 'Tracking the Dynamic Evolution of Participant Salience in a Discussion ',
        people: [
          'Ahmed Hassan',
          'Dragomir Radev'],
        image: 'research_files/polisci-small-330.png',
        links: {
          data: [],
          paper: [],
          demo: [],
          title: [],
          other: []},
        narrative:'In this work, we introduce a technique for analyzing the temporal evolution of the salience of participants in a discussion. Our method can dynamically track how the relative importance of speakers evolve over time using graph based techniques. Speaker salience is computed based on the eigenvector centrality in a graph representation of participants in a discussion. Two participants in a discussion are linked with an edge if they use similar rhetoric. The method is dynamic in the sense that the graph evolves over time to capture the evolution inherent to the participants salience. We used our method to track the salience of members of the US Senate using data from the US Congressional Record. Our analysis investigated how the salience of speakers changes over time. Our results show that the scores can capture speaker centrality in topics as well as events that result in change of salience or influence among different participants. '
      }
    },
    {projectName: 'Facets',
      data: {
        title: ' Detecting Multiple Facets of an Event Using Graph-Based Unsupervised Methods',
        people: [
          'Pradeep Muthukrishnan',
          'Joshua Gerrish',
          'Dragomir Radev'],
        image: 'research_files/vtech_topics-330.png',
        links: {
          data: [],
          paper: [],
          demo: [],
          title: [],
          other: []},
        narrative:'We propose a new unsupervised method to extract different facets about news events from blog postings. The method is a two step process with the first step generating different candidate facets using Kullback-Leibler divergence and the second step focuses on selecting a set of facets which cover the entire space of documents while maximizing the diversity of the facets themselves. The second step is formulated as a dynamic weighted set cover problem and solved using a greedy algorithm. '
      }
    },
    {projectName: 'SSkNN',
      data: {
        title: 'Improved Nearest Neighbor Methods For Text Classification With Language Modeling and Harmonic Functions ',
        people: [
          'Gunes Erkan',
          'Ahmed Hassan',
          'Dragomir Radev'],
        image: 'research_files/ssknn-330.png',
        links: {
          data: [],
          paper: [],
          demo: [],
          title: [],
          other: []},
        narrative:'In this project, we presented new nearest neighbor methods for text classification and an evaluation of these methods against the existing nearest neighbor methods as well as other well-known text classification algorithms. Inspired by the language modeling approach to information retrieval, we show improvements in k-nearest neighbor (kNN) classification by replacing the classical cosine similarity with a KL divergence based similarity measure. We also present an extension of kNN to the semi-supervised case which turns out to be a formulation that is equivalent to semi-supervised learning with harmonic functions. In both supervised and semi-supervised experiments, our algorithms surpass the state-of-the-art methods such as Support Vector Machines (SVM) and transductive SVM on the Reuters Corpus Volume I (RCV1), and the 20 Newsgroups dataset and produce competitive results on the Reuters-21578 dataset. To our knowledge, this paper presents the most comprehensive evaluation of different machine learning algorithms on the entire RCV1 dataset. '
      }
    },
    {projectName: 'GIN',
      data: {
        title: 'Gene Interaction Network',
        people: [
          'Arzucan Ozgur',
          'Thuy Vu',
          'Gunes Erkan',
          'Anthony Fader',
          'Joshua Gerrish',
          'Mark Schaller',
          'Dragomir Radev'],
        image: 'research_files/gin-330.png',
        links: {
          data: [],
          paper: [],
          demo: [
            {'text': 'Demo Site',
             'href': 'http://belobog.si.umich.edu:8080/gin/'}
          ],
          title: [],
          other: []
        },
        narrative:'GIN (Gene Interaction Network) is a system for browsing articles and molecule interaction information. What makes GIN stand out from other similar systems is that it uses automated methods (such as dependency parsing) to mine the text for relevant information (such as protein interactions) and computes statistics for the interaction network. The user can browse articles with highlighted summary sentences, citing sentences (sentences from other articles that cite the article in question), and interaction sentences. The user can also browse molecules to view their interactions, neighborhood, and other network statistics. '
      }
    },
    {projectName: 'GIN-NA',
      data: {
        title: 'GIN-NA: Gene Interaction Network Analysis',
        people: [
          'Arzucan Ozgur',
          'Dragomir Radev'],
        image: '',
        links: {
          data: [],
          paper: [],
          demo: [],
          title: [],
          other: []},
        narrative: 'GIN-NA is a system for analysing molecule interaction networks. The interaction networks are retrieved from the MiMI database, which integrates protein interactions from diverse biological data sources. Analysis of two types of networks are performed, namely molecule-specific networks and disease-specific networks. Molecule-specific networks are the networks of interactions in the neighborhood of a molecule. Besides the general network statistics such as average degree, power-law degree distribution, clustering coefficient, and shortest path statistics, GIN-NA ranks the molecules in the network based on graph centrality measures and second neighbor statistics. Disease-specific networks are built by compiling lists of known disease genes and retrieving the interactions among these genes and their neighborhood. We hypothesize that the genes central in the disease-specific gene interaction network are likely to be related to the disease and rank the genes based on their centrality scores. Currently, GIN-NA provides disease-specific networks for the four Driving Biological Problems, Prostate Cancer, Type 1 Diabetes, Type 2 Diabetes, and Bipolar Disorder.'
      }
    },
    {projectName: 'GIN-IE',
      data: {
        title: 'GIN-IE: Gene Interaction Extraction from the Literature',
        people: [
          'Arzucan Ozgur',
          'Gunes Erkan',
          'Dragomir Radev'],
        image: 'research_files/prot_int-330.png',
        links: {
          data: [],
          paper: [],
          demo: [
            {'text': 'Demo Site',
             'href': 'http://belobog.si.umich.edu/clair/clair/protinter.html'}
          ],
          title: [],
          other: []},
        narrative:'We present our approach of using dependency parsing and machine learning techniques to identify interacting protein pairs from full text articles and extracting the most relevant sentences that describe their interaction. We extract features from the dependency parse trees of the sentences and use these features to train an SVM classifier to identify and rank sentences that describe an interaction. Dependency parse trees not only capture sentence syntax but also some of its semantics such as predicate-argument relationships. We also present the improved version of our system, where we extract paths between a protein pair in the dependency parse tree of a sentence and define two kernel functions for SVM based on the cosine and edit distance based similarities among these paths. '
      }
    },
    {projectName: 'BioEvents',
      data: {
        title: 'Extracting Biomedical Events from the Literature',
        people: [
          'Arzucan Ozgur',
          'Dragomir Radev'],
        image: '',
        links: {
          data: [],
          paper: [],
          demo: [],
          title: [],
          other: []},
        narrative: 'Most previous work on biomedical information extraction focuses on identifying relationships among biomedical entities (e.g. protein-protein interactions). Unlike relationships, which are in general characterized with a pair of entities, events can be characterized with event types and multiple entities in varying roles. The BioNLP\'09 Shared Task addresses the extraction of bio-molecular events from the biomedical literature. We participated in the “Event Detection and Characterization” task (Task 1). The goal was to recognize the events concerning the given proteins by detecting the event triggers, determining the event types, and identifying the event participants. We group the event types into three general classes based on the number and types of participants that they involve. The first class includes the event types that are described with a single theme participant. The second class includes the event types that are described with one or more theme participants. The third class includes the events that are described with a theme and/or a cause participant. We learn support vector machine (SVM) models for each class of events to classify each candidate event trigger/participant pair as a real trigger/participant pair or not. We use various types of linguistic features such as lexical, positional, and dependency relation features that represent the contexts of the candidate trigger/participant pairs.'
      }
    },
    {projectName: 'BioContext',
      data: {
        title: 'Extracting Non-local Context for Biomedical Information Extraction',
        people: [
          'Arzucan Ozgur',
          'Dragomir Radev'],
        image: '',
        links: {
          data: [],
          paper: [],
          demo: [],
          title: [],
          other: []},
        narrative: 'Most previous studies focus on extracting relationships between pairs of molecules. However, the context information such as the type, the directionality, the location, and the condition of the relationship are also important. While some types of context information such as the relationship type and directionality can be extracted locally from the sentence, other types of context information such as the experimental method and the species are not always found in the sentence, but need to be extracted non-locally from the entire document. We created guidelines for corpus annotation for non-local (document-level) context extraction. We are annotating full text articles for species mentions. The articles are retrieved from PubMed Central Open Access. We approach the problem as identifying the linguistic scope of each species mention in the article. We defined scope classes such as entity, sentence, paragraph, section, and article. For example, the scope of a species mention is entity level, if it applies to a certain entity (gene/protein) in the sentence. On the other hand, if it applies to all the entities in the paragraph its scope is defined to be paragraph level. The annotated corpus will enable us to learn models for identifying the species of the molecule mentions in the text.'
      }
    },
    {projectName: 'Speculation',
      data: {
        title: 'Detecting Speculations and Resolving their Scopes in Scientific Text',
        people: [
          'Arzucan Ozgur',
          'Dragomir Radev'
        ],
        image: '',
        links: {
          data: [],
          paper: [],
          demo: [],
          title: [],
          other: []},
        narrative: 'Speculation is a frequently used language phenomenon in biomedical scientific articles. When researchers are not completely certain about the inferred conclusions, they use speculative language to convey this uncertainty. While speculative information might still be useful for biomedical scientists, it is important that it is distinguished from the factual information. We introduce an approach which is based on solving two sub-problems to identify speculative sentence fragments. The first sub-problem is identifying the speculation keywords in the sentences and the second one is resolving their linguistic scopes. We formulate the first sub-problem as a supervised classification task, where we classify the potential keywords as real speculation keywords or not by using a diverse set of linguistic features that represent the contexts of the keywords. After detecting the actual speculation keywords, we use the syntactic structures of the sentences to determine their scopes.'
      }
    },
    {projectName: 'Tumbl',
      data: {
        title: 'Graph-Based Semi-supervised Learning',
        people: ['Dragomir Radev'],
        image: 'research_files/tumbl-330.png',
        links: {
          data: [],
          paper: [],
          demo: [
            {'text': 'Demo Site',
             'href': 'http://belobog.si.umich.edu/clair/tumbl'}
          ],
          title: [],
          other: []},
        narrative:'Tripartite updating is related to the principal eigenvector of a stochastic Markov process. This algorithm is a variant of the HITS algorithm (it uses a bipartite underlying structure and its stationary solution is computed iteratively), though it differs from it in three important ways: (a) the "right-hand" component of the graph is split into two groups: labeled and unlabeled data instances - therefore the name "tripartite", (b) there is an initial assignment of values for the labeled examples, and (c) the scores of the labeled examples are not allowed to change with time. '
      }
    },
    {projectName: 'LexRank',
      data: {
        title: 'Lexical Networks and Lexical Centrality',
        people: [
          'Gunes Erkan',
          'Jahna Otterbacher',
          'Dragomir Radev'],
        image: 'research_files/Lexnet-330.png',
        links: {
          data: [],
          paper: [],
          demo: [
            {'text': 'Lexical networks and lexical centrality',
             'href': 'http://belobog.si.umich.edu/clair/lexrank'},
            {'text': 'LexRankMead',
             'href': 'http://belobog.si.umich.edu/clair/clair/lexrankmead.html'}
          ],
          title: [],
          other: [
            {'text': 'Lexical networks',
             'href': 'http://belobog.si.umich.edu/clair/clair/lexnets.html'}
          ]
        },
        narrative:'We introduce a stochastic graph-based method for computing relative importance of textual units for Natural Language Processing. We consider a new approach, LexRank, for computing sentence importance based on the concept of eigenvector centrality in a graph representation of sentences. In this model, a connectivity matrix based on intra-sentence cosine similarity is used as the adjacency matrix of the graph representation of sentences. The results show that degree-based methods (including LexRank) outperform both centroid-based methods and other systems participating in DUC in most of the cases.'
      }
    },
    {projectName: 'MEAD',
      data: {
        title: 'Text Summarization',
        people: [ 'Dragomir Radev'],
        image: 'research_files/mead-330.png',
        links: {
          data: [],
          paper: [],
          demo: [{'text': 'MEAD', 'href': 'http://www.summarization.com/mead'},
                 {'text': 'NewsInEssence - Broken Link', 'href': 'http://www.newsinessence.com/'}
          ],
          title: [],
          other: [
            {'text': 'CSTBank',
             'href': 'http://belobog.si.umich.edu/clair/CSTBank/'},
            {'text': 'SUMMBANK',
             'href': 'http://www.summarization.com/summbank/'},
            {'text': 'Resources',
             'href': 'http://www.summarization.com/mead'}
          ]
        },
        narrative:'MEAD is the most elaborate publicly available platform for multi-lingual summarization and evaluation.The platform implements multiple summarization algorithms such as position-based, centroid-based, largest common subsequence, and keywords. The methods for evaluating the quality of the summaries are both intrinsic and extrinsic. MEAD implements a battery of summarization algorithms, including baselines (lead-based and random) as well as centroid-based and query-based methods.'
      }
    },
    {projectName: 'BlogoCenter',
      data: {
        title: 'Analysis of the Blogosphere',
        people: ['Ahmed Hassan',
                 'Vahed Qazvinian',
                 'Dragomir Radev'],
        image: 'research_files/Blogocenter-330.png',
        links: {
          data: [],
          paper: [],
          demo: [],
          title: [],
          other: [
            {'text': 'BlogoCenter',
             'href': 'http://belobog.si.umich.edu/clair/blogocenter/'}
          ]
        },
        narrative:'If journalists deliver the first draft of history; bloggers today often deliver the first draft of journalism. Never before have so many members of the human race recorded their thoughts and observations in a form so widely accessible.  Collectively referred to as the blogosphere, these sites are of enormous value for researchers across a huge swath of the arts and sciences, both now and far into the future.<br />The BlogoCenter system uses the latest in natural language processing tools to build a system that  (1) continuously monitors, collects, and stores personal Weblogs (or blogs) at a central location, (2) discovers hidden structures and trends automatically from the blogs, and (3) makes them easily accessible to general users. By making the new information on the blogs easy to discover and access, this project is helping blogs realize their full potential for societal change as the "grassroots media." It is also collecting an important hypertext dataset of human interactions for further analysis by the research community.<br /><br />There are two main objectives for this project. The first is efficient monitoring and collection of blogs. For that objective, we developed novel monitoring algorithms that discovers and downloads new information from rapidly-changing distributed sources with minimal delay.<br />Compared to the traditional Web, blogs are significantly more dynamic and their contents are highly time sensitive. In addition, blogs often exhibit patterns that are tightly connected to the general human behavior. We believe these distinctive characteristics make the traditional Web models (such as homogeneous Poisson model for Web page changes) and crawling algorithms inappropriate for the blog-data collection, necessitating the development of new techniques appropriate for the blogs. As part of this effort, a massive dataset of blogs was collected. RSS feeds from the Bloglines, Blogspot, Microsoft Live Spaces, and syndic8 aggregators have been retrieved for the past several years.<br />The dataset contains over 192 million blog posts.<br /><br />The second objective is using text and graph mining to develop novel and effective ranking and summarization algorithms for blogs. There are three distinctive characteristics of the blogs that make their ranking and summarization significantly different: (1) Compared to the traditional Web where Web pages are the basic unit of information, blogs are organized around a much smaller unit, called postings or articles. These articles are then concatenated (often in the reverse-chronological order) to form pages. (2) Articles on the blogs are time-stamped. These time stamps allow us to learn how quickly and in what manner particular information is spread on the blog. (3) The articles on each blog are typically authored by a single individual, so it is easier to establish the authorship of the blog articles. We used both content based and hyperlink based models to build blog ranking and recommendation system that can suggest blogs to read for users that have an interest in a particular topic. We also track how the interests in a a particular topic varies over time and use that to find out blogs that has continuous recurring interest in a a particular topic.'
      }
    },
    {projectName: 'AAN',
      data: {
        title: 'The ACL Anthology Network',
        people: ['Dragomir Radev',
                 'Pradeep Muthukrishnan'],
        image: 'research_files/AAN-330.png',
        links: {
          data: [],
          paper: [],
          demo: [],
          title: [],
          other: [
            {'text': 'ACL Anthology Network',
             'href': 'http://belobog.si.umich.edu/clair/anthology/'}
          ]
        },
        narrative:'ACL Anthology is a collection of research papers in the field of computational linguistics. After a lot of pre-processing the papers which involved extracting the text from PDF, cleaning up the results, we semi-automatically match citations to compute the paper citation network. Using the metadata about the papers which contains the authorship information, venue, year of publication, we have created auxiliary networks like author citation network and author collaboration network. We attempt to identify the most central papers, authors using different measures of impact and network centrality measures.<br />The extracted citation data has further been used for summarization of papers and can be used for computing better similarity measures which use both the text of the papers and the citation links.'
      }
    },
    {projectName: 'iOpener',
      data: {
        title: 'Summarizing Scientific Papers',
        people: [
          'Vahed Qazvinian',
          'Dragomir Radev'],
        image: 'research_files/citation_sentences-330.png',
        links: {
          data: [],
          paper: [],
          demo: [],
          title: [],
          other: [{'text': 'iOpener',
                   'href': 'http://belobog.si.umich.edu/clair/iopener/'}],
        },
        narrative:'With the emphasis on cross-disciplinary science growing, the need for researchers to rapidly learn about a new subject area has never been greater. An example might be an information scientist who must become versed in network analysis to understand journal articles on Internet use research. The iOPENER framework will automatically organize, summarize, and display comprehensive information about scientific topics in such a way that learners at any level from novice to domain expert can rapidly digest it. iOPENER will be particularly valuable in harvesting and presenting complex information that would otherwise be too dense and technical for all but a few specialists. <br /><br /> Our approach to such a system is based on three currently available technologies: (1) bibliometric lexical link mining that exploits the structure of citations and relations among citations; (2) summarization techniques that exploit the content of the material in both the citing and cited papers; and (3) visualization tools for displaying both structure and content. In iOpener we are trying to link these three technologies and evaluate different forms of presentation for rapid learning in unfamiliar research domains. <br /><br /> To tackle the problem of generating surveys, the first step is to summarize scientific articles. We have achieved this by developing systems that extract the main nuggets  of an article. In our work we use the citation summaries to understand the main contributions of articles. We have made programs to automatically extract such summaries, and have shown how citation summaries provide more coherent information than abstracts.  Moreover, we have investigated the usefulness of directly summarizing citation texts in the automatic creation of technical surveys. We automatically generated surveys of a wide range of topics including  Question Answering, Machine Translation, and Dependency Parsing using paper contents, abstracts, and their citation texts. Our evaluations confirm that both citation texts and abstracts have unique survey-worthy information.'
      }
    },
    {projectName: 'NSIR',
      data: {
        title: 'Question Answering',
        people: ['Dragomir Radev',
                 'Hong Qi',
                 'Gunes Erkan'],
        image: 'research_files/nsir-330.png',
        links: {
          data: [],
          paper: [],
          demo: [
              {'text': 'NSIR',
               'href': 'http://belobog.si.umich.edu/clair/NSIR/html/nsir.cgi'}],
          title: [],
          other: []},
        narrative: 'NSIR uses a fine question taxonomy, extracts candidate answers along with nine features: frequency, overlap, length, proximity, POSSIG, LEXSIG, local word list, named entity, and web ranking. Potential answers were ranked according to a set of techniques before they are returned to NSIR users. The proximity algorithm is based on the closeness in text between the question words and the neighbors of each phrasal answer. A potential answer that is spatially close to question words gets a higher score than one that is farther away. Probablistic phrase ranking takes expected answer type into consideration. Each phrase is assigned a probablibity score indicating the extent to which the phrase matches the expected answer type with respect to part-of-speech tag sentences.'
      }
    }
];
