Skip to content

Commit 5593ad9

Browse files
author
Daniel Standage
authored
Support for taxonkit 0.6.1, including new "name" function (#2)
This PR takes advantage of taxonkit 0.6.1's new `--show-name` and `--no-lineage` flags to add a new `name()` function to pytaxonkit.
1 parent 9b47894 commit 5593ad9

4 files changed

Lines changed: 105 additions & 25 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,4 @@ __pycache__/
33
.coverage
44
dist/
55
build/
6+
sandbox/

README.md

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,8 @@ Execute `help(pytaxonkit.name2taxid)` (and so on) from the Python interpreter fo
5050
>>> import pytaxonkit
5151
>>> result = pytaxonkit.lineage([7399, 1973489])
5252
>>> result.columns
53-
Index(['TaxID', 'Code', 'Lineage', 'LineageTaxIDs', 'Rank', 'FullLineage',
54-
'FullLineageTaxIDs'],
53+
Index(['TaxID', 'Code', 'Name', 'Lineage', 'LineageTaxIDs', 'Rank',
54+
'FullLineage', 'FullLineageTaxIDs'],
5555
dtype='object')
5656
>>> result[['TaxID', 'Lineage', 'LineageTaxIDs']]
5757
TaxID Lineage LineageTaxIDs
@@ -62,6 +62,16 @@ Index(['TaxID', 'Code', 'Lineage', 'LineageTaxIDs', 'Rank', 'FullLineage',
6262
'Lactobacillaceae;Lactobacillus;Lactobacillus delbrueckii;Lactobacillus delbrueckii subsp. bulgaricus'
6363
```
6464

65+
### name
66+
67+
```python
68+
>>> import pytaxonkit
69+
>>> pytaxonkit.name(['274127', 511170])
70+
TaxID Name
71+
0 274127 Distathma
72+
1 511170 Delicatula
73+
```
74+
6575
### list
6676

6777
```python
@@ -74,7 +84,7 @@ Index(['TaxID', 'Code', 'Lineage', 'LineageTaxIDs', 'Rank', 'FullLineage',
7484
Top level result: Solenopsis (13685); 198 related taxa
7585
Top level result: Bos (9903); 26 related taxa
7686
>>> subtaxa[0]
77-
Taxon(taxid=9904, rank='species', name='Bos gaurus')
87+
BasicTaxon(taxid=9904, rank='species', name='Bos gaurus')
7888
>>> pytaxonkit.list([9605], raw=True)
7989
{'9605 [genus] Homo': {'9606 [species] Homo sapiens': {'63221 [subspecies] Homo sapiens neanderthalensis': {}, "741158 [subspecies] Homo sapiens subsp. 'Denisova'": {}, '2665952 [no rank] environmental samples': {'2665953 [species] Homo sapiens environmental sample': {}}}, '1425170 [species] Homo heidelbergensis': {}}}
8090
```
@@ -84,9 +94,9 @@ Taxon(taxid=9904, rank='species', name='Bos gaurus')
8494
```python
8595
>>> import pytaxonkit
8696
>>> pytaxonkit.__version__
87-
0.6
97+
0.6.1
8898
>>> pytaxonkit.__taxonkitversion__
89-
0.6.0
99+
0.6.1
90100
```
91101

92102

pytaxonkit.py

Lines changed: 87 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -191,12 +191,12 @@ def list(ids, raw=False, threads=None, data_dir=None, debug=False):
191191
... subtaxa = [t for t in tree.traverse]
192192
... print(f'Top level result: {taxon.name} ({taxon.taxid}); {len(subtaxa)} related taxa')
193193
...
194-
Top level result: Solenopsis (13685); 198 related taxa
195-
Top level result: Bos (9903); 26 related taxa
194+
Top level result: Solenopsis (13685); 293 related taxa
195+
Top level result: Bos (9903); 27 related taxa
196196
>>> subtaxa[0]
197197
BasicTaxon(taxid=9904, rank='species', name='Bos gaurus')
198198
>>> pytaxonkit.list([9605], raw=True)
199-
{'9605 [genus] Homo': {'9606 [species] Homo sapiens': {'63221 [subspecies] Homo sapiens neanderthalensis': {}, "741158 [subspecies] Homo sapiens subsp. 'Denisova'": {}, '2665952 [no rank] environmental samples': {'2665953 [species] Homo sapiens environmental sample': {}}}, '1425170 [species] Homo heidelbergensis': {}}}
199+
{'9605 [genus] Homo': {'9606 [species] Homo sapiens': {'63221 [subspecies] Homo sapiens neanderthalensis': {}, "741158 [subspecies] Homo sapiens subsp. 'Denisova'": {}}, '1425170 [species] Homo heidelbergensis': {}, '2665952 [no rank] environmental samples': {'2665953 [species] Homo sapiens environmental sample': {}}}}
200200
''' # noqa: E501
201201
idlist = ','.join(map(str, ids))
202202
arglist = ['taxonkit', 'list', '--json', '--show-name', '--show-rank', '--ids', idlist]
@@ -298,14 +298,18 @@ def lineage(ids, formatstr=None, threads=None, data_dir=None, debug=False):
298298
299299
Examples
300300
--------
301+
>>> import pandas
301302
>>> import pytaxonkit
302-
>>> result = pytaxonkit.lineage([7399, 1973489])
303+
>>> result = pytaxonkit.lineage([1325911, 1649473, 1401311])
303304
>>> result.columns
304-
Index(['TaxID', 'Code', 'Lineage', 'LineageTaxIDs', 'Rank', 'FullLineage', 'FullLineageTaxIDs'], dtype='object')
305+
Index(['TaxID', 'Code', 'Name', 'Lineage', 'LineageTaxIDs', 'Rank',
306+
'FullLineage', 'FullLineageTaxIDs'],
307+
dtype='object')
305308
>>> result[['TaxID', 'Lineage', 'LineageTaxIDs']]
306-
TaxID Lineage LineageTaxIDs
307-
0 7399 Eukaryota;Arthropoda;Insecta;Hymenoptera;;; 2759;6656;50557;7399;;;
308-
1 1973489 Bacteria;Firmicutes;Bacilli;Bacillales;Bacillaceae;Bacillus;Bacillus sp. ISSFR-25F 2;1239;91061;1385;186817;1386;1973489
309+
TaxID Lineage LineageTaxIDs
310+
0 1325911 Eukaryota;Arthropoda;Insecta;Hymenoptera;Eucharitidae;Pogonocharis; 2759;6656;50557;7399;216140;1325911;
311+
1 1649473 Bacteria;Bacteroidetes;Cytophagia;Cytophagales;Cytophagaceae;Nibrella; 2;976;768503;768507;89373;1649473;
312+
2 1401311 Eukaryota;Arthropoda;Insecta;Coleoptera;Staphylinidae;Styngetus; 2759;6656;50557;7041;29026;1401311;
309313
>>> result = pytaxonkit.lineage(['1382510', '929505', '390333'], formatstr='{f};{g};{s};{S}')
310314
>>> result[['TaxID', 'Lineage', 'LineageTaxIDs']]
311315
TaxID Lineage LineageTaxIDs
@@ -314,13 +318,16 @@ def lineage(ids, formatstr=None, threads=None, data_dir=None, debug=False):
314318
2 390333 Lactobacillaceae;Lactobacillus;Lactobacillus delbrueckii;Lactobacillus delbrueckii subsp. bulgaricus 33958;1578;1584;1585
315319
''' # noqa: E501
316320
idlist = '\n'.join(map(str, ids))
317-
arglist = ['taxonkit', 'lineage', '--show-lineage-taxids', '--show-rank', '--show-status-code']
321+
arglist = [
322+
'taxonkit', 'lineage', '--show-lineage-taxids', '--show-rank', '--show-status-code',
323+
'--show-name'
324+
]
318325
if threads:
319326
arglist.extend(('--threads', validate_threads(threads)))
320327
if data_dir:
321328
arglist.extend(('--data-dir', validate_data_dir(data_dir))) # pragma: no cover
322329
if debug:
323-
log(*arglist) # pragma: no cover
330+
log(*arglist)
324331
with NamedTemporaryFile(suffix='-lineage.txt') as lineagefile:
325332
proc = Popen(arglist, stdin=PIPE, stdout=lineagefile, stderr=PIPE, universal_newlines=True)
326333
out, err = proc.communicate(input=idlist)
@@ -346,10 +353,12 @@ def lineage(ids, formatstr=None, threads=None, data_dir=None, debug=False):
346353
if proc.returncode != 0:
347354
raise TaxonKitCLIError(err) # pragma: no cover
348355
columnorderin = [
349-
'TaxID', 'Code', 'FullLineage', 'FullLineageTaxIDs', 'Rank', 'Lineage', 'LineageTaxIDs'
356+
'TaxID', 'Code', 'FullLineage', 'FullLineageTaxIDs', 'Name', 'Rank', 'Lineage',
357+
'LineageTaxIDs'
350358
]
351359
columnorderout = [
352-
'TaxID', 'Code', 'Lineage', 'LineageTaxIDs', 'Rank', 'FullLineage', 'FullLineageTaxIDs'
360+
'TaxID', 'Code', 'Name', 'Lineage', 'LineageTaxIDs', 'Rank', 'FullLineage',
361+
'FullLineageTaxIDs'
353362
]
354363
data = pandas.read_csv(
355364
StringIO(out), sep='\t', header=None, names=columnorderin, index_col=False
@@ -358,26 +367,69 @@ def lineage(ids, formatstr=None, threads=None, data_dir=None, debug=False):
358367
return data
359368

360369

370+
def name(ids, data_dir=None, debug=False):
    '''rapid taxon name retrieval

    Uses the `--no-lineage` option in `taxonkit lineage` for rapid retrieval of taxon names.

    Parameters
    ----------
    ids : list or iterable
        A list of taxids (ints or strings are ok)
    data_dir : str, default None
        Specify the location of the NCBI taxonomy `.dmp` files; by default, taxonkit searches in
        `~/.taxonkit/`
    debug : bool, default False
        Print debugging output, e.g., system calls to `taxonkit`

    Returns
    -------
    DataFrame
        A two-dimensional data structure with TaxIDs and taxon names.

    Raises
    ------
    TaxonKitCLIError
        If the `taxonkit` process exits with a non-zero status.

    Examples
    --------
    >>> import pytaxonkit
    >>> pytaxonkit.name(['151837', '2216222', '517824'])
         TaxID                                Name
    0   151837                    Hiraea smilacina
    1  2216222         Paramyia sp. BIOUG21706-A10
    2   517824  soil bacterium Cipr-S1N-M1LLLSSL-1
    '''
    # taxonkit reads one taxid per line on standard input.
    idlist = '\n'.join(map(str, ids))
    arglist = ['taxonkit', 'lineage', '--show-name', '--no-lineage']
    if data_dir:
        arglist.extend(('--data-dir', validate_data_dir(data_dir)))  # pragma: no cover
    if debug:
        log(*arglist)
    proc = Popen(arglist, stdin=PIPE, stdout=PIPE, stderr=PIPE, universal_newlines=True)
    out, err = proc.communicate(input=idlist)
    # Mirror `lineage()`: surface taxonkit's own error message rather than letting an empty or
    # partial stdout fail later inside the pandas parser.
    if proc.returncode != 0:
        raise TaxonKitCLIError(err)  # pragma: no cover
    data = pandas.read_csv(
        StringIO(out), sep='\t', header=None, names=['TaxID', 'Name'], index_col=False
    )
    return data
411+
412+
361413
def test_lineage(capsys):
362-
result = lineage(['446045', '265720', '2507530', '106649'], debug=True)
363-
assert result.TaxID.equals(pandas.Series([446045, 265720, 2507530, 106649]))
364-
assert result.Code.equals(pandas.Series([446045, 265720, 2507530, 106649]))
414+
result = lineage(['1082657', '265720', '2507530', '106649'], debug=True)
415+
assert result.TaxID.equals(pandas.Series([1082657, 265720, 2507530, 106649]))
416+
assert result.Code.equals(pandas.Series([1082657, 265720, 2507530, 106649]))
365417
assert result.Lineage.equals(pandas.Series([
366-
'Eukaryota;Arthropoda;Insecta;Diptera;Drosophilidae;Drosophila;',
418+
'Eukaryota;Discosea;;Longamoebia;Acanthamoebidae;Acanthamoeba;Acanthamoeba sp. TW95',
367419
'Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Porphyromonadaceae;Porphyromonas;'
368420
'Porphyromonas genomosp. P3',
369421
'Eukaryota;Basidiomycota;Agaricomycetes;Russulales;Russulaceae;Russula;Russula species',
370422
'Bacteria;Proteobacteria;Gammaproteobacteria;Pseudomonadales;Moraxellaceae;Acinetobacter;'
371423
'Acinetobacter guillouiae',
372424
]))
373425
assert result.LineageTaxIDs.equals(pandas.Series([
374-
'2759;6656;50557;7147;7214;7215;',
426+
'2759;555280;;1485168;33677;5754;1082657',
375427
'2;976;200643;171549;171551;836;265720',
376428
'2759;5204;155619;452342;5401;5402;2507520',
377429
'2;1224;1236;72274;468;469;106649',
378430
]))
379431
assert result.Rank.equals(pandas.Series([
380-
'no rank', 'species', 'subspecies', 'species'
432+
'species', 'species', 'subspecies', 'species'
381433
]))
382434

383435
out, err = capsys.readouterr()
@@ -393,6 +445,22 @@ def test_lineage_threads():
393445
)
394446

395447

448+
def test_lineage_name():
    '''The `Name` column returned by `lineage()` holds the name of the queried taxon.'''
    names = lineage(['526061']).Name
    assert names.iloc[0] == 'Henosepilachna sp. AGBA-2008'
451+
452+
453+
def test_name_debug(capsys):
    '''`name()` returns the expected names and, with `debug=True`, logs its command to stderr.'''
    expected = pandas.Series([
        'Ahnfeltiopsis concinna',
        'Picobirnavirus turkey/USA-1512/2010',
        'Isopoda sp. NZAC 03013534',
    ])
    observed = name([207661, 1353792, 1597281], debug=True)
    assert observed.Name.equals(expected)
    # Only the captured stderr matters here; stdout is discarded.
    _, stderr = capsys.readouterr()
    assert 'taxonkit lineage --show-name --no-lineage' in stderr
462+
463+
396464
# -------------------------------------------------------------------------------------------------
397465
# taxonkit name2taxid
398466
# -------------------------------------------------------------------------------------------------
@@ -473,4 +541,4 @@ def test_name2taxid(capsys):
473541

474542
def test_name2taxid_threads():
475543
result = name2taxid(['FCB group'], threads='1')
476-
assert str(result) == ' Name TaxID Rank\n0 FCB group 1783270 no rank'
544+
assert str(result) == ' Name TaxID Rank\n0 FCB group 1783270 clade'

sweetleaf.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -214,7 +214,8 @@
214214
"1679792 [species] Symplocos sp. 2 XCH-2015": {},
215215
"2048312 [species] Symplocos sp. JH-2017": {},
216216
"2054402 [species] Symplocos sp. US:3560071": {},
217-
"2688201 [species] Symplocos sp. NTT-2019": {}
217+
"2688201 [species] Symplocos sp. NTT-2019": {},
218+
"2744220 [species] Symplocos sp. Zhang 4651": {}
218219
},
219220
"2662168 [species] Symplocos guianensis": {},
220221
"2687301 [species] Symplocos cambodiana": {},

0 commit comments

Comments
 (0)