This mimics the code in tests/test_search.py
! rm -rf data/paper-model2.yaml data/files data/index
! mkdir data/files
! mkdir data/index
from paperapp.paper_repo import PaperRepo
from paperapp.paper_ipython import *
from paperapp.paper_bibtex import import_bibtex
# Open the paper repository on the freshly reset model file, file directory
# and index directory; auto_save=True is passed straight to PaperRepo.
p = PaperRepo("data/paper-model2.yaml", "data/files", "data/index", auto_save=True)

# Import the test bibliography into the repo. The returned ids are used
# below to look the imported papers up in the repo.
ids = import_bibtex(
    p,
    os.path.join('..', 'tests', 'duboue.bib'),
    verbose=False,
    create_papers=True,
)

# Index the imported papers by their BibTeX id so the rest of the script
# can refer to them by citation key.
bib_to_paper = {}
for paper_id in ids:
    paper = p[paper_id]
    bib_to_paper[paper['bibtex']['id']] = paper
# (BibTeX id, PDF file name) pairs for the papers whose PDF is registered
# below; the file names are resolved relative to ../tests.
ondisk = [
    ('duboue2019dialect', 'CLEI2019dialect.pdf'),
    ('ying-duboue-2019-rationale', 'W19-5503.pdf'),
    ('duboue2018choice', 'ASAI2018choice.pdf'),
    ('Duboue_al_2013', '05-NTCIR10-1CLICK-PabloD.pdf'),
    ('duboue_2013_ENLG2', 'W13-2129.pdf'),
    ('Jing_al_12', 'C12-1069.pdf'),
    ('Pacheco_al_12', 'N12-1082.pdf'),
    ('Duboue_12', 'INLG2012duboue.pdf'),
    ('Ferrucci_al_08', 'rc24789.pdf'),
    ('Duboue_Chu-Carrol_06', 'N06-2009.pdf'),
    ('Prager_al_06', 'P06-1135.pdf'),
    ('Duboue_McKeown_03a', 'W03-1016.pdf'),
    ('Costa_Duboue_04', 'ASAI04distributed.pdf'),
    ('Hatzivassiloglou_al_01a', 'ISMB2001disambiguation.pdf'),
]

# Register each PDF with the repo and store whatever register_file returns
# under the paper's 'on-disk' key. A BibTeX id missing from bib_to_paper
# raises KeyError here.
tests_dir = os.path.join("..", "tests")
for entry, fname in ondisk:
    bib_to_paper[entry]['on-disk'] = p.register_file(os.path.join(tests_dir, fname))
# Summarise what was loaded. Note the file count is the length of the
# ``ondisk`` table, not a figure read back from the index.
entry_count = len(ids)
file_count = len(ondisk)
print("Imported ", entry_count, " entries")
print("Index contains", file_count, " files")

# Show the first 500 characters of the text the repo returns for one of the
# registered papers.
ferrucci_text = p.text(bib_to_paper['Ferrucci_al_08'])
print(ferrucci_text[:500])
# For each reference paper, ask the repo for similar papers and print them
# as a 1-based numbered list. The first query uses similarto's default
# arguments; the second caps the result at five via limit=5.
similar_queries = (
    ('Ferrucci_al_08', {}),
    ('Costa_Duboue_04', {'limit': 5}),
)
for bib_key, extra_args in similar_queries:
    base = bib_to_paper[bib_key]
    results = p.similarto(base, **extra_args)
    print("Papers similar to ", base['text'], "\n")
    for idx, pp in enumerate(results):
        print("\t", (idx + 1), "\t", pp['paper']['text'])
# Run each query against the repo and print the hits as a 1-based numbered
# list. The first three queries cap the result at five via limit=5; the last
# two use search's default arguments. Queries run in the order listed, with
# no separator printed between them.
search_queries = (
    ("question answering", {'limit': 5}),
    ("summarization", {'limit': 5}),
    ("summarization AND NOT ( question answering )", {'limit': 5}),
    ("feature AND selection AND NOT ( dimensionality OR reduction)", {}),
    ("feature AND selection", {}),
)
for query, extra_args in search_queries:
    for idx, pp in enumerate(p.search(query, **extra_args)):
        print("\t", (idx + 1), "\t", pp['paper']['text'])