Create a test repo

This mimics the code in `tests/test_search.py`.

In [1]:
! rm -rf data/paper-model2.yaml data/files data/index
! mkdir data/files
! mkdir data/index
In [2]:
# `os` is imported explicitly because later cells call os.path.join and no
# visible cell imports it; it was presumably only reaching the namespace
# through the star import below, which hides the dependency.
import os

from paperapp.paper_repo import PaperRepo
from paperapp.paper_ipython import *
from paperapp.paper_bibtex import import_bibtex

# Open the repo on the same three paths that the first cell clears and recreates.
p = PaperRepo("data/paper-model2.yaml", "data/files", "data/index", auto_save=True)
In [3]:
# Import the BibTeX fixture into the repo; `ids` holds whatever ids the
# importer returns.
bibtex_path = os.path.join('..', 'tests', 'duboue.bib')
ids = import_bibtex(p, bibtex_path, verbose=False, create_papers=True)

# Index the imported papers by their BibTeX key so the cells below can refer
# to them by citation key instead of by repo id.
imported_papers = [p[paper_id] for paper_id in ids]
bib_to_paper = {paper['bibtex']['id']: paper for paper in imported_papers}

# (BibTeX key, PDF file name under ../tests) pairs to register with the repo.
ondisk = [
    ('duboue2019dialect', 'CLEI2019dialect.pdf'),
    ('ying-duboue-2019-rationale', 'W19-5503.pdf'),
    ('duboue2018choice', 'ASAI2018choice.pdf'),
    ('Duboue_al_2013', '05-NTCIR10-1CLICK-PabloD.pdf'),
    ('duboue_2013_ENLG2', 'W13-2129.pdf'),
    ('Jing_al_12', 'C12-1069.pdf'),
    ('Pacheco_al_12', 'N12-1082.pdf'),
    ('Duboue_12', 'INLG2012duboue.pdf'),
    ('Ferrucci_al_08', 'rc24789.pdf'),
    ('Duboue_Chu-Carrol_06', 'N06-2009.pdf'),
    ('Prager_al_06', 'P06-1135.pdf'),
    ('Duboue_McKeown_03a', 'W03-1016.pdf'),
    ('Costa_Duboue_04', 'ASAI04distributed.pdf'),
    ('Hatzivassiloglou_al_01a', 'ISMB2001disambiguation.pdf'),
]

# Register each PDF and record the value register_file returns on the
# matching paper under the 'on-disk' key.
tests_dir = os.path.join("..", "tests")
for bib_key, pdf_name in ondisk:
    registered = p.register_file(os.path.join(tests_dir, pdf_name))
    bib_to_paper[bib_key]['on-disk'] = registered

print("Imported ", len(ids), " entries")
# NOTE: this count is the length of the hard-coded `ondisk` list, not a value
# read back from the index.
print("Index contains", len(ondisk), " files")
Imported  44  entries
Index contains 14  files

Access the extracted text

In [4]:
# Show only the first 500 characters of the text the repo returns for this paper.
ferrucci_text = p.text(bib_to_paper['Ferrucci_al_08'])
print(ferrucci_text[:500])
RC24789 (W0904-093) April 22, 2009
Computer Science




                        IBM Research Report
             Towards the Open Advancement of Question
                         Answering Systems

   David Ferrucci1, Eric Nyberg2, James Allan3, Ken Barker4, Eric Brown1,
    Jennifer Chu-Carroll1, Arthur Ciccolo1, Pablo Duboue1, James Fan1,
 David Gondek1, Eduard Hovy5, Boris Katz6, Adam Lally1, Michael McCord1,
       Paul Morarescu1, Bill Murdock1, Bruce Porter4, John Prager1,
          Tomek 

Similarity-based queries

In [5]:
# Ask the repo for papers similar to the Ferrucci et al. report (no limit
# given) and list them with a 1-based rank.
base = bib_to_paper['Ferrucci_al_08']
results = p.similarto(base)
print("Papers similar to ", base['text'],"\n")
for rank, hit in enumerate(results, start=1):
    print("\t", rank, "\t", hit['paper']['text'])
Papers similar to  Towards the Open Advancement of Question Answering Systems -- David Ferrucci and Eric Nyberg and James Allen and Ken Barker and Eric W. Brown and Jennifer Chu-Carroll and Arthur Ciccolo and Pablo A. Duboue and James Fan and David Gondek and Edward Hovy and Boris Katz and Adam Lally and Michael McCord and Paul Morarescu and J. William Murdock and Bruce Porter and John M. Prager and Tomek Strzalkowski and Christopher Welty and Wlodek Zadrozny 

	 1 	 Improving {QA} Accuracy by Question Inversion -- John M. Prager and Pablo A. Duboue and Jennifer Chu-Carroll
	 2 	 Answering the question you wish they had asked: The impact of paraphrasing for Question Answering -- Pablo A. Duboue and Jennifer Chu-Carroll
	 3 	 Hunter Gatherer: UdeM at 1Click-2 -- Duboue, Pablo and He, Jing and Nie, Jian-Yun
	 4 	 Bridging the Gap between Intrinsic and Perceived Relevance in Snippet Generation -- He, Jing and Duboue, Pablo and Nie,  Jian-Yun
	 5 	 Disambiguating Proteins, Genes, and {RNA} in Text: A Machine Learning Approach -- Vasileios Hatzivassiloglou and Pablo A. Duboue and Andrey Rzhetsky
	 6 	 Rationale Classification for Educational Trading Platforms -- Ying, Annie  and  Duboue, Pablo
	 7 	 Deobfuscating Name Scrambling as a Natural Language Generation Task -- Duboue, Pablo Ariel
	 8 	 Impact of Spanish Dialect in Deep Learning Next Sentence Predictors -- Duboue, Pablo Ariel
	 9 	 On The Feasibility of Open Domain Referring Expression Generation Using Large Scale Folksonomies -- Pacheco, Fabian  and  Duboue, Pablo  and  Dominguez, Martin
	 10 	 Distributed Ontological Encoding Through Symbol Recirculation -- Maria Jimena Costa and Pablo A. Duboue
	 11 	 Statistical Acquisition of Content Selection Rules for Natural Language Generation -- Pablo A. Duboue and Kathleen R. McKeown
	 12 	 Thoughtland: Natural Language Descriptions for Machine Learning n-dimensional Error Functions -- Duboue, Pablo
	 13 	 Extractive email thread summarization: Can we do better than He Said She Said? -- Pablo A. Duboue
In [6]:
# Same similarity query for another paper, this time passing limit=5.
base = bib_to_paper['Costa_Duboue_04']
results = p.similarto(base, limit=5)
print("Papers similar to ", base['text'],"\n")
for rank, hit in enumerate(results, start=1):
    print("\t", rank, "\t", hit['paper']['text'])
Papers similar to  Distributed Ontological Encoding Through Symbol Recirculation -- Maria Jimena Costa and Pablo A. Duboue 

	 1 	 Rationale Classification for Educational Trading Platforms -- Ying, Annie  and  Duboue, Pablo
	 2 	 Impact of Spanish Dialect in Deep Learning Next Sentence Predictors -- Duboue, Pablo Ariel
	 3 	 Disambiguating Proteins, Genes, and {RNA} in Text: A Machine Learning Approach -- Vasileios Hatzivassiloglou and Pablo A. Duboue and Andrey Rzhetsky
	 4 	 Improving {QA} Accuracy by Question Inversion -- John M. Prager and Pablo A. Duboue and Jennifer Chu-Carroll
	 5 	 Deobfuscating Name Scrambling as a Natural Language Generation Task -- Duboue, Pablo Ariel

Text queries

In [7]:
# Free-text query; print each hit with a 1-based rank.
hits = p.search("question answering", limit=5)
for rank, hit in enumerate(hits, start=1):
    print("\t", rank, "\t", hit['paper']['text'])
	 1 	 Answering the question you wish they had asked: The impact of paraphrasing for Question Answering -- Pablo A. Duboue and Jennifer Chu-Carroll
	 2 	 Improving {QA} Accuracy by Question Inversion -- John M. Prager and Pablo A. Duboue and Jennifer Chu-Carroll
	 3 	 Towards the Open Advancement of Question Answering Systems -- David Ferrucci and Eric Nyberg and James Allen and Ken Barker and Eric W. Brown and Jennifer Chu-Carroll and Arthur Ciccolo and Pablo A. Duboue and James Fan and David Gondek and Edward Hovy and Boris Katz and Adam Lally and Michael McCord and Paul Morarescu and J. William Murdock and Bruce Porter and John M. Prager and Tomek Strzalkowski and Christopher Welty and Wlodek Zadrozny
	 4 	 Hunter Gatherer: UdeM at 1Click-2 -- Duboue, Pablo and He, Jing and Nie, Jian-Yun
	 5 	 Extractive email thread summarization: Can we do better than He Said She Said? -- Pablo A. Duboue
In [8]:
# Single-term query; print each hit with a 1-based rank.
hits = p.search("summarization", limit=5)
for rank, hit in enumerate(hits, start=1):
    print("\t", rank, "\t", hit['paper']['text'])
	 1 	 Bridging the Gap between Intrinsic and Perceived Relevance in Snippet Generation -- He, Jing and Duboue, Pablo and Nie,  Jian-Yun
	 2 	 Extractive email thread summarization: Can we do better than He Said She Said? -- Pablo A. Duboue
	 3 	 On The Feasibility of Open Domain Referring Expression Generation Using Large Scale Folksonomies -- Pacheco, Fabian  and  Duboue, Pablo  and  Dominguez, Martin
	 4 	 Hunter Gatherer: UdeM at 1Click-2 -- Duboue, Pablo and He, Jing and Nie, Jian-Yun
	 5 	 Deobfuscating Name Scrambling as a Natural Language Generation Task -- Duboue, Pablo Ariel

Boolean queries

In [9]:
# Query combining a term with AND NOT over a parenthesised group.
hits = p.search("summarization AND NOT ( question answering )", limit=5)
for rank, hit in enumerate(hits, start=1):
    print("\t", rank, "\t", hit['paper']['text'])
	 1 	 Bridging the Gap between Intrinsic and Perceived Relevance in Snippet Generation -- He, Jing and Duboue, Pablo and Nie,  Jian-Yun
	 2 	 On The Feasibility of Open Domain Referring Expression Generation Using Large Scale Folksonomies -- Pacheco, Fabian  and  Duboue, Pablo  and  Dominguez, Martin
	 3 	 Deobfuscating Name Scrambling as a Natural Language Generation Task -- Duboue, Pablo Ariel
In [10]:
# Query mixing AND, NOT and OR; no limit is passed here.
hits = p.search("feature AND selection AND NOT ( dimensionality OR reduction)")
for rank, hit in enumerate(hits, start=1):
    print("\t", rank, "\t", hit['paper']['text'])
	 1 	 Bridging the Gap between Intrinsic and Perceived Relevance in Snippet Generation -- He, Jing and Duboue, Pablo and Nie,  Jian-Yun
	 2 	 Improving {QA} Accuracy by Question Inversion -- John M. Prager and Pablo A. Duboue and Jennifer Chu-Carroll
In [11]:
# The same two terms joined by AND only, for comparison with the previous
# cell's query that also excludes terms.
hits = p.search("feature AND selection")
for rank, hit in enumerate(hits, start=1):
    print("\t", rank, "\t", hit['paper']['text'])
	 1 	 Answering the question you wish they had asked: The impact of paraphrasing for Question Answering -- Pablo A. Duboue and Jennifer Chu-Carroll
	 2 	 Statistical Acquisition of Content Selection Rules for Natural Language Generation -- Pablo A. Duboue and Kathleen R. McKeown
	 3 	 Bridging the Gap between Intrinsic and Perceived Relevance in Snippet Generation -- He, Jing and Duboue, Pablo and Nie,  Jian-Yun
	 4 	 Improving {QA} Accuracy by Question Inversion -- John M. Prager and Pablo A. Duboue and Jennifer Chu-Carroll