-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathget_links.py
More file actions
61 lines (52 loc) · 2.62 KB
/
get_links.py
File metadata and controls
61 lines (52 loc) · 2.62 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
"""Main file to retrieve arXiv categories from arXiv API."""
import argparse
from pathlib import PurePath, Path
from datetime import datetime
import pandas as pd
from src.query_making.query_arxiv_cat import QueryArXivCat
# Search terms used to select e-prints with security considerations.
# A trailing '*' is passed through verbatim to the API as a wildcard.
CYBER_KEYS = (
    'secur*',
    'safe*',
    'reliability',
    'dependability',
    'confidentiality',
    'integrity',
    'availability',
    'defen*',
    'priva*',
)


def format_query(key):
    """Return a clause matching ``key`` in the title or in the abstract.

    Args:
        key: search term, inserted verbatim after the ``ti:`` and ``abs:``
            field prefixes.

    Returns:
        The string ``'ti:<key> OR abs:<key>'``.
    """
    return 'ti:' + key + ' OR abs:' + key


def build_cyber_keywords(keys=CYBER_KEYS):
    """Build the grouped keyword clause handed to ``QueryArXivCat``.

    Args:
        keys: iterable of search terms; defaults to ``CYBER_KEYS``.

    Returns:
        The per-key clauses joined with ``' OR '`` and wrapped in
        ``%28`` / ``%29`` (the percent-encoded forms of ``(`` and ``)``).
    """
    return '%28' + ' OR '.join(format_query(key) for key in keys) + '%29'


def parse_args(argv=None):
    """Parse the command line options of the script.

    Args:
        argv: list of argument strings; ``None`` makes argparse read
            ``sys.argv[1:]``.

    Returns:
        Namespace with ``output`` (output folder) and ``categories_path``
        (file containing the list of categories).
    """
    parser = argparse.ArgumentParser(description=('Makes queries and does them '
                                                  'on the arXiv API in order to create a database.'))
    parser.add_argument('--output',
                        dest='output',
                        action='store',
                        help='path to the output folder',
                        default=PurePath('data'))
    parser.add_argument('--categories',
                        dest='categories_path',
                        action='store',
                        help='path to the file containing the list of categories',
                        default=PurePath('data').joinpath('crtc_info.csv'))
    return parser.parse_args(argv)


def main(argv=None):
    """Run one unfiltered and one keyword-filtered query per category.

    Creates two dated folders under the output folder, reads the ``CRTC``
    column of the ``;``-separated categories file, and for every value
    builds and processes two ``QueryArXivCat`` objects: one without
    keywords and one restricted by ``build_cyber_keywords()``.

    Args:
        argv: list of argument strings forwarded to ``parse_args``.
    """
    args = parse_args(argv)

    # Output folders are suffixed with the current date (YYYY_MM_DD).
    cur_time = datetime.now().strftime('%Y_%m_%d')

    # User-supplied paths go through PurePath (original note: cleaning the
    # path for Windows compatibility).
    output_root = PurePath(args.output)
    e_prints_path = output_root.joinpath('query_e-prints_' + cur_time)
    cyber_e_prints_path = output_root.joinpath('cyber_e_prints_' + cur_time)
    categories_path = PurePath(args.categories_path)

    # PurePath cannot touch the file system; wrap in Path to create folders.
    for folder in (e_prints_path, cyber_e_prints_path):
        Path(folder).mkdir(parents=True, exist_ok=True)

    cyber_keywords = build_cyber_keywords()

    # Open the list of categories
    categories = pd.read_csv(categories_path,
                             sep=';',
                             dtype='string')['CRTC']

    for category in categories.values:
        # All e-prints in a category
        QueryArXivCat(e_prints_path, category).processing()
        # Only e-prints with security considerations in a category
        QueryArXivCat(cyber_e_prints_path, category, cyber_keywords).processing()


if __name__ == '__main__':
    main()