Skip to content

Commit 0953148

Browse files
committed
Add XMP metadata support for MP4 files
This patch adds basic XMP metadata support for MP4 files, using XMP data as a fallback when standard MP4 metadata atoms are missing.

Implementation:
- Parse XMP data from standard UUID atoms (BE7ACFCB-97A9-42E8-9C71-999491E3AFAC)
- Support Dublin Core metadata fields (dc:title, dc:creator, dc:date, etc.)
- Parse structured filenames from stRef:filePath references in XMP
- Single-pass parsing: capture XMP during atom traversal
- Only process XMP when standard metadata is missing (fallback behavior)
- Clean up the internal XMP data field after processing

XMP metadata is only used when the corresponding standard MP4 metadata field is not present, ensuring compatibility with existing behavior while providing fallback metadata extraction for files with incomplete tagging.
1 parent 7221f79 commit 0953148

1 file changed

Lines changed: 213 additions & 3 deletions

File tree

tinytag/tinytag.py

Lines changed: 213 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ def __init__(self) -> None:
9999
self.genre: str | None = None
100100
self._genre_text: str | None = None # From ©gen text atom
101101
self._genre_binary: str | None = None # From gnre binary atom
102+
self._xmp_data: bytes | None = None # XMP data from uuid atoms
102103
self.year: str | None = None
103104
self.comment: str | None = None
104105

@@ -280,6 +281,9 @@ def _cleanup_internal_fields(self) -> None:
280281
delattr(self, '_genre_text')
281282
if hasattr(self, '_genre_binary'):
282283
delattr(self, '_genre_binary')
284+
# Remove XMP processing fields
285+
if hasattr(self, '_xmp_data'):
286+
delattr(self, '_xmp_data')
283287

284288
def _set_field(self, fieldname: str, value: str | float,
285289
check_conflict: bool = True) -> None:
@@ -524,7 +528,9 @@ def _parse_tag(self, fh: BinaryIO) -> None:
524528
# the atom data. Callables return {fieldname: value} which updates
525529
# the TinyTag.
526530
if _MP4._meta_data_tree is None:
527-
_MP4._meta_data_tree = {b'moov': {b'udta': {b'meta': {b'ilst': {
531+
_MP4._meta_data_tree = {
532+
b'uuid': lambda atom_data: {}, # handled by special case
533+
b'moov': {b'udta': {b'meta': {b'ilst': {
528534
# http://atomicparsley.sourceforge.net/mpeg-4files.html
529535
# https://metacpan.org/dist/Image-ExifTool/source/lib/Image/ExifTool/QuickTime.pm#L3093
530536
b'\xa9ART': {b'data': _MP4._data_parser('artist')},
@@ -551,10 +557,13 @@ def _parse_tag(self, fh: BinaryIO) -> None:
551557
b'tmpo': {b'data': _MP4._data_parser('other.bpm')},
552558
b'covr': {b'data': _MP4._parse_cover_image},
553559
b'----': _MP4._parse_custom_field,
554-
}}}}}
560+
}}}},
561+
}
555562
self._traverse_atoms(fh, path=_MP4._meta_data_tree)
556563
# Apply genre priority: prefer ©gen text over gnre binary
557564
self._resolve_mp4_genre()
565+
# Apply XMP metadata as fallback for missing fields
566+
self._apply_xmp_metadata_fallback()
558567

559568
def _resolve_mp4_genre(self) -> None:
560569
"""Apply MP4 genre priority: prefer ©gen text over gnre binary."""
@@ -568,6 +577,45 @@ def _resolve_mp4_genre(self) -> None:
568577
self._genre_text = None
569578
self._genre_binary = None
570579

580+
def _apply_xmp_metadata_fallback(self) -> None:
581+
"""Apply XMP metadata as fallback for missing MP4 fields."""
582+
# Only process XMP if we have missing metadata and captured XMP data
583+
if (self._xmp_data and
584+
(not self.title or not self.artist or not self.album or
585+
not self.year or not self.comment)):
586+
587+
# Process the stored XMP data
588+
xmp_metadata = self._parse_xmp_metadata(self._xmp_data)
589+
590+
# Only use XMP data if TinyTag didn't find the corresponding fields
591+
title = xmp_metadata.get("title")
592+
if title and isinstance(title, str) and not self.title:
593+
self.title = title
594+
artist = xmp_metadata.get("artist")
595+
if artist and isinstance(artist, str) and not self.artist:
596+
self.artist = artist
597+
album = xmp_metadata.get("album")
598+
if album and isinstance(album, str) and not self.album:
599+
self.album = album
600+
track = xmp_metadata.get("track")
601+
if track and not self.track:
602+
try:
603+
if isinstance(track, str):
604+
self.track = int(track)
605+
elif isinstance(track, int):
606+
self.track = track
607+
except (ValueError, TypeError):
608+
pass
609+
year = xmp_metadata.get("year")
610+
if year and isinstance(year, str) and not self.year:
611+
self.year = year
612+
comment = xmp_metadata.get("comment")
613+
if comment and isinstance(comment, str) and not self.comment:
614+
self.comment = comment
615+
616+
# Clear XMP data after processing
617+
self._xmp_data = None
618+
571619
def _traverse_atoms(self,
572620
fh: BinaryIO,
573621
path: _DataTreeDict,
@@ -621,7 +669,15 @@ def _traverse_atoms(self,
621669
curr_path=curr_path + [atom_type])
622670
# if the path-leaf is a callable, call it on the atom data
623671
elif callable(sub_path):
624-
for fieldname, value in sub_path(fh.read(atom_size)).items():
672+
atom_data = fh.read(atom_size)
673+
# Special handling for UUID atoms to capture XMP data
674+
if atom_type == b'uuid':
675+
self._parse_uuid_atom(atom_data)
676+
result_dict = {}
677+
else:
678+
result_dict = sub_path(atom_data)
679+
680+
for fieldname, value in result_dict.items():
625681
if _DEBUG:
626682
print(' ' * 4 * len(curr_path), 'FIELD: ', fieldname)
627683
if isinstance(value, Image):
@@ -699,6 +755,160 @@ def _parse_cover_image(cls, data_atom: bytes) -> dict[str, Image]:
699755
'front_cover', data_atom[8:], cls._IMAGE_MIME_TYPES.get(data_type))
700756
return {'images.front_cover': image}
701757

758+
def _parse_uuid_atom(
759+
self, atom_data: bytes
760+
) -> dict[str, str | int | bool | list[str]]:
761+
"""Parse uuid atoms and capture XMP data."""
762+
if len(atom_data) >= 16:
763+
uuid_bytes = atom_data[:16]
764+
content = atom_data[16:]
765+
766+
# Standard XMP UUID: BE7ACFCB-97A9-42E8-9C71-999491E3AFAC
767+
xmp_uuid = bytes.fromhex("BE7ACFCB97A942E89C71999491E3AFAC")
768+
769+
if (uuid_bytes == xmp_uuid or
770+
content.startswith(b"<?xpacket begin=") or
771+
b"<x:xmpmeta" in content or
772+
b'xmlns:x="adobe:ns:meta/"' in content):
773+
# Store XMP data for later processing
774+
self._xmp_data = content
775+
776+
return {} # uuid atoms don't directly set metadata fields
777+
778+
@classmethod
779+
def _parse_filename_metadata(
780+
cls, filename: str
781+
) -> dict[str, str | int | bool | list[str]]:
782+
"""Parse metadata from structured filename patterns."""
783+
import re
784+
metadata: dict[str, str | int | bool | list[str]] = {}
785+
786+
# Parse different filename patterns
787+
parts = [part.strip() for part in filename.split(" - ")]
788+
if len(parts) >= 4:
789+
# Format: artist - album - track_number - title
790+
metadata["artist"] = parts[0]
791+
metadata["album"] = parts[1]
792+
metadata["track"] = parts[2]
793+
metadata["title"] = parts[3]
794+
elif len(parts) == 3:
795+
# Format: artist - album - title or artist - album - track_number
796+
metadata["artist"] = parts[0]
797+
metadata["album"] = parts[1]
798+
third_part = parts[2]
799+
800+
# Check if third part is just a number (track number)
801+
if third_part.isdigit():
802+
metadata["track"] = third_part
803+
else:
804+
# Check if it starts with a track number
805+
track_match = re.match(r"^(\d+)\s*[-.]?\s*(.+)", third_part)
806+
if track_match:
807+
metadata["track"] = track_match.group(1)
808+
if track_match.group(2):
809+
metadata["title"] = track_match.group(2)
810+
else:
811+
metadata["title"] = third_part
812+
elif len(parts) == 2:
813+
# Format: artist - title
814+
metadata["artist"] = parts[0]
815+
title_part = parts[1]
816+
817+
# Check if title starts with track number
818+
track_match = re.match(r"^(\d+)\s*[-.]?\s*(.+)", title_part)
819+
if track_match:
820+
metadata["track"] = track_match.group(1)
821+
if track_match.group(2):
822+
metadata["title"] = track_match.group(2)
823+
else:
824+
metadata["title"] = title_part
825+
826+
return metadata
827+
828+
@classmethod
829+
def _parse_dublin_core_metadata(
830+
cls, xmp_text: str
831+
) -> dict[str, str | int | bool | list[str]]:
832+
"""Parse Dublin Core metadata from XMP text."""
833+
import re
834+
metadata: dict[str, str | int | bool | list[str]] = {}
835+
836+
# Look for standard Dublin Core metadata
837+
# dc:title → title
838+
dc_title_pattern = r"<dc:title[^>]*>.*?<rdf:li[^>]*>(.*?)</rdf:li>"
839+
title_matches = re.findall(dc_title_pattern, xmp_text, re.DOTALL)
840+
if title_matches:
841+
metadata["title"] = title_matches[0].strip()
842+
843+
# dc:creator → artist
844+
dc_creator_pattern = r"<dc:creator[^>]*>.*?<rdf:li[^>]*>(.*?)</rdf:li>"
845+
creator_matches = re.findall(dc_creator_pattern, xmp_text, re.DOTALL)
846+
if creator_matches:
847+
metadata["artist"] = creator_matches[0].strip()
848+
849+
# dc:date → year
850+
dc_date_pattern = r"<dc:date[^>]*>.*?<rdf:li[^>]*>(.*?)</rdf:li>"
851+
date_matches = re.findall(dc_date_pattern, xmp_text, re.DOTALL)
852+
if date_matches:
853+
year_match = re.match(r"(\d{4})", date_matches[0].strip())
854+
if year_match:
855+
metadata["year"] = year_match.group(1)
856+
857+
# dc:subject → comments
858+
dc_subject_pattern = r"<dc:subject[^>]*>.*?<rdf:li[^>]*>(.*?)</rdf:li>"
859+
subject_matches = re.findall(dc_subject_pattern, xmp_text, re.DOTALL)
860+
if subject_matches:
861+
metadata["comment"] = subject_matches[0].strip()
862+
863+
# dc:description → comments (overwrites subject)
864+
desc_pattern = r"<dc:description[^>]*>.*?<rdf:li[^>]*>(.*?)</rdf:li>"
865+
description_matches = re.findall(desc_pattern, xmp_text, re.DOTALL)
866+
if description_matches:
867+
metadata["comment"] = description_matches[0].strip()
868+
869+
return metadata
870+
871+
@classmethod
872+
def _parse_xmp_metadata(
873+
cls, xmp_content: bytes
874+
) -> dict[str, str | int | bool | list[str]]:
875+
"""Parse XMP content for music metadata fields."""
876+
if not xmp_content:
877+
return {}
878+
879+
try:
880+
# Decode XMP content
881+
xmp_text = xmp_content.decode("utf-8", errors="ignore")
882+
metadata: dict[str, str | int | bool | list[str]] = {}
883+
884+
# Look for file references that contain music metadata
885+
import re
886+
file_path_pattern = r'stRef:filePath="([^"]+)"'
887+
file_paths = re.findall(file_path_pattern, xmp_text)
888+
889+
for file_path in file_paths:
890+
# Check if it's an audio file
891+
audio_exts = [".wav", ".mp3", ".flac", ".aac", ".m4a"]
892+
if file_path and any(
893+
file_path.lower().endswith(ext) for ext in audio_exts
894+
):
895+
# Parse metadata from filename
896+
filename = file_path.split("/")[-1] # Get filename
897+
filename = re.sub(r"\.[^.]+$", "", filename) # Remove ext
898+
metadata.update(cls._parse_filename_metadata(filename))
899+
break # Use first audio file found
900+
901+
# Add Dublin Core metadata if filename parsing didn't work
902+
dublin_metadata = cls._parse_dublin_core_metadata(xmp_text)
903+
for key, value in dublin_metadata.items():
904+
if key not in metadata:
905+
metadata[key] = value
906+
907+
return metadata
908+
909+
except Exception:
910+
return {}
911+
702912
@classmethod
703913
def _read_extended_descriptor(cls, esds_atom: BinaryIO) -> None:
704914
for _i in range(4):

0 commit comments

Comments
 (0)