// Source: sdl/internal/downloader/downloader.go
// 2025-11-18 13:07:24 +01:00 · 677 lines · 18 KiB · Go

package downloader
import (
"bufio"
"bytes"
"context"
"errors"
"fmt"
"io"
"net/http"
"net/url"
"os"
"os/exec"
"path"
"path/filepath"
"regexp"
"sort"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/grafov/m3u8"
)
// Tunables for HTTP access and livestream polling.
const (
	// defaultClientTimeout is the Timeout of the HTTP client built in Download.
	defaultClientTimeout = 30 * time.Second

	// vodSegmentTimeout is the Timeout of the client used for VOD segments,
	// which can be large and therefore get more time than other requests.
	vodSegmentTimeout = 5 * time.Minute

	// maxRedirects is the redirect budget passed to fetchWithRedirects.
	maxRedirects = 5

	// playlistPollInterval is how often a livestream playlist is re-fetched.
	playlistPollInterval = 2 * time.Second
)
// Sentinel errors shared by the functions in this file.
var (
	// errNoStreamsFound is returned when an HTML page yields no stream URLs.
	errNoStreamsFound = errors.New("no HLS playlists found on page")

	// errInvalidPlaylist is returned when a decoded playlist is neither a
	// usable media playlist nor a master playlist.
	errInvalidPlaylist = errors.New("unsupported playlist type. expected media playlist")

	// errUnsupportedMaster is returned when a master playlist has no variant
	// with a URI.
	errUnsupportedMaster = errors.New("master playlist contains no playable variants")

	// errFFmpegMissing is returned when the ffmpeg binary cannot be found.
	errFFmpegMissing = errors.New("ffmpeg is required on PATH to transmux segments")
)
// segmentInfo is the subset of a playlist segment needed to download it.
type segmentInfo struct {
	// Sequence is the segment's sequence id within its playlist; it is used
	// for ordering and for de-duplicating segments across playlist refreshes.
	Sequence uint64

	// URI is the segment location as written in the playlist; it is resolved
	// against the playlist URL before being fetched.
	URI string
}
// Download fetches the video stream(s) behind sourceURL and writes them to
// files named after outputName. When outputName is empty a name is derived
// from the stream URL.
//
// The URL's Content-Type is probed first. An HTML page is scanned for stream
// URLs and Download is invoked again for each of them; when the page yields
// more than one stream, every file name gets a "-NN" suffix before its
// extension. Any other content type is treated as a playlist. The first
// failure aborts the remaining work and is returned.
func Download(ctx context.Context, sourceURL, outputName string) error {
	if sourceURL == "" {
		return errors.New("stream URL must not be empty")
	}

	target, err := url.Parse(sourceURL)
	if err != nil {
		return fmt.Errorf("invalid URL: %w", err)
	}

	httpClient := &http.Client{Timeout: defaultClientTimeout}
	mediaType, err := peekContentType(ctx, httpClient, target)
	if err != nil {
		return fmt.Errorf("detect content type: %w", err)
	}

	// Not a web page: hand the URL straight to the playlist downloader, which
	// derives its own contexts for VOD and livestream handling.
	if !strings.Contains(mediaType, "text/html") {
		return downloadPlaylist(ctx, httpClient, target, outputName)
	}

	found, err := extractPlaylistsFromHTML(ctx, httpClient, target)
	if err != nil {
		return err
	}
	if len(found) == 0 {
		return errNoStreamsFound
	}

	numbered := len(found) > 1
	for i, streamURL := range found {
		fileName := outputName
		if fileName == "" {
			fileName = inferOutputNameMust(streamURL)
		}
		if numbered {
			// Keep the extension last: "name.mp4" becomes "name-01.mp4".
			ext := filepath.Ext(fileName)
			stem := strings.TrimSuffix(fileName, ext)
			if stem == "" {
				stem = fmt.Sprintf("video-%d", i+1)
			}
			fileName = fmt.Sprintf("%s-%02d%s", stem, i+1, ext)
		}
		if err := Download(ctx, streamURL, fileName); err != nil {
			return err
		}
	}
	return nil
}
// watchForQuitKey blocks reading lines from standard input and calls cancel
// once a line whose first non-blank character is 'q' or 'Q' is entered
// (so "q", "Q" and "quit" all work). It returns immediately when stdin is not
// a terminal, and stops silently when stdin reaches EOF or fails.
func watchForQuitKey(cancel context.CancelFunc) {
	if !isTerminal(os.Stdin) {
		return
	}

	// Shown once, before the first read.
	fmt.Fprintf(os.Stderr, "Press 'q' and Enter to stop downloading and convert to MP4\n")

	stdin := bufio.NewReader(os.Stdin)
	for {
		input, err := stdin.ReadString('\n')
		if err != nil {
			return
		}
		command := strings.TrimSpace(input)
		if !strings.HasPrefix(command, "q") && !strings.HasPrefix(command, "Q") {
			continue
		}
		fmt.Fprintf(os.Stderr, "\nStopping download and converting to MP4...\n")
		cancel()
		return
	}
}
// isTerminal reports whether f is a character device, which is how this
// package recognises an interactive terminal. A file that cannot be stat'ed
// is reported as not being a terminal.
func isTerminal(f *os.File) bool {
	info, err := f.Stat()
	return err == nil && info.Mode()&os.ModeCharDevice != 0
}
// downloadPlaylist fetches the playlist at parsed and downloads the stream it
// describes into a file derived from outputName (or from the playlist URL when
// outputName is empty).
//
//   - A master playlist is resolved to its highest-bandwidth variant, which is
//     handed back to Download.
//   - A media playlist that is not closed is treated as a livestream: segments
//     are appended to a permanent .ts file until the stream ends or the user
//     stops it, after which an MP4 conversion is attempted.
//   - A closed media playlist is treated as VOD: segments are collected in a
//     temporary .ts file that is transmuxed to MP4 and then removed.
//
// It returns an error when the playlist cannot be fetched or parsed, contains
// no downloadable segments, or when any VOD step fails.
func downloadPlaylist(ctx context.Context, client *http.Client, parsed *url.URL, outputName string) error {
	body, finalURL, err := fetchWithRedirects(ctx, client, parsed, maxRedirects)
	if err != nil {
		return fmt.Errorf("fetch playlist: %w", err)
	}
	playlist, listType, err := m3u8.DecodeFrom(bufio.NewReader(body), true)
	// The playlist is fully decoded (or unusable) at this point, so release the
	// connection now instead of holding it for the whole download.
	body.Close()
	if err != nil {
		return fmt.Errorf("parse playlist: %w", err)
	}

	var mediaPlaylist *m3u8.MediaPlaylist
	switch listType {
	case m3u8.MEDIA:
		mp, ok := playlist.(*m3u8.MediaPlaylist)
		if !ok {
			return errInvalidPlaylist
		}
		mediaPlaylist = mp
	case m3u8.MASTER:
		masterPlaylist, ok := playlist.(*m3u8.MasterPlaylist)
		if !ok {
			return errInvalidPlaylist
		}
		variantURL, err := selectVariant(masterPlaylist, finalURL)
		if err != nil {
			return err
		}
		return Download(ctx, variantURL, outputName)
	default:
		return errInvalidPlaylist
	}

	// The decoder's Segments slice is a pre-sized buffer that may hold nil
	// entries (collectSegments skips them), so emptiness is judged on the
	// filtered list rather than on len(mediaPlaylist.Segments).
	segments := collectSegments(mediaPlaylist)
	if len(segments) == 0 {
		return errors.New("playlist contains no segments")
	}

	name := outputName
	if name == "" {
		name = inferOutputName(finalURL)
	}

	// An open playlist (no end marker yet) is a livestream.
	if !mediaPlaylist.Closed {
		// The quit-key watcher cancels liveCtx, which downloadLiveStream
		// treats as "stop and convert".
		liveCtx, cancel := context.WithCancel(ctx)
		defer cancel()
		go watchForQuitKey(cancel)

		tsFile, err := os.Create(ensureTSExtension(name))
		if err != nil {
			return fmt.Errorf("create output file: %w", err)
		}
		// downloadLiveStream takes ownership of tsFile and closes it.
		return downloadLiveStream(liveCtx, client, finalURL, mediaPlaylist, tsFile, name)
	}

	// VOD. vodCtx is rooted in the background context and linked to ctx by
	// hand: it is cancelled as soon as ctx is done. The forwarding goroutine
	// also exits when vodCtx itself ends, so it cannot outlive this call.
	vodCtx, vodCancel := context.WithCancel(context.Background())
	defer vodCancel()
	go func() {
		select {
		case <-ctx.Done():
			vodCancel()
		case <-vodCtx.Done():
		}
	}()

	// Segments can be large, so they get a client with a longer timeout.
	vodClient := &http.Client{Timeout: vodSegmentTimeout}

	tempTS, err := os.CreateTemp("", "sdl-*.ts")
	if err != nil {
		return fmt.Errorf("create temp file: %w", err)
	}
	tempPath := tempTS.Name()
	defer func() {
		tempTS.Close() // harmless if already closed below
		os.Remove(tempPath)
	}()

	totalSegments := len(segments)
	fmt.Fprintf(os.Stderr, "Downloading %d segments...\n", totalSegments)
	for i, segment := range segments {
		if err := downloadSegment(vodCtx, vodClient, finalURL, segment, tempTS); err != nil {
			return fmt.Errorf("download segment %d: %w", i, err)
		}
		progress := float64(i+1) / float64(totalSegments) * 100
		fmt.Fprintf(os.Stderr, "\rProgress: %d/%d segments (%.1f%%)", i+1, totalSegments, progress)
		os.Stderr.Sync() // flush the progress line
	}
	fmt.Fprintf(os.Stderr, "\n")

	// ffmpeg opens the file by path, so close our handle first; this also
	// surfaces any write error that is only reported on close.
	if err := tempTS.Close(); err != nil {
		return fmt.Errorf("close temp file: %w", err)
	}

	mp4Name := ensureMP4Extension(name)
	fmt.Fprintf(os.Stderr, "Converting to MP4...\n")
	if err := transmuxToMP4(vodCtx, tempPath, mp4Name); err != nil {
		return err
	}
	fmt.Fprintf(os.Stderr, "Complete: %s\n", mp4Name)
	return nil
}
// downloadLiveStream records a livestream into output.
//
// It first appends every segment of initialPlaylist, then re-fetches
// playlistURL every playlistPollInterval and appends segments whose sequence
// number has not been written yet. Recording stops when ctx is done or when
// the playlist is reported closed; in both cases the file is closed and an MP4
// conversion is attempted via finalizeAndConvert.
//
// The function owns output: it is closed on every return path. A failure while
// downloading the initial segments is returned as an error, unless it was
// caused by ctx being done, in which case what was recorded so far is
// finalized. Failures while polling are not fatal and not reported; the
// affected playlist or segment is simply tried again on the next poll.
func downloadLiveStream(ctx context.Context, client *http.Client, playlistURL *url.URL, initialPlaylist *m3u8.MediaPlaylist, output *os.File, outputName string) error {
	// Sequence numbers already written to output.
	downloadedSeqs := make(map[uint64]bool)
	tsPath := output.Name()

	for _, segment := range collectSegments(initialPlaylist) {
		if err := appendSegment(ctx, client, playlistURL, segment, output); err != nil {
			if ctx.Err() != nil {
				// The user asked to stop while the initial window was still
				// downloading: keep and convert what we have.
				return finalizeAndConvert(output, tsPath, outputName)
			}
			output.Close()
			return fmt.Errorf("download initial segment %d: %w", segment.Sequence, err)
		}
		// Sync after each segment so the data survives an abrupt exit.
		if err := output.Sync(); err != nil {
			output.Close()
			return fmt.Errorf("sync file after segment %d: %w", segment.Sequence, err)
		}
		downloadedSeqs[segment.Sequence] = true
	}

	// appendPending writes every not-yet-recorded segment of pl. A segment
	// that fails is skipped and stays unmarked, so it is retried on a later
	// poll for as long as the playlist still lists it.
	appendPending := func(pl *m3u8.MediaPlaylist) {
		for _, segment := range collectSegments(pl) {
			if downloadedSeqs[segment.Sequence] {
				continue
			}
			if err := appendSegment(ctx, client, playlistURL, segment, output); err != nil {
				continue
			}
			if err := output.Sync(); err != nil {
				continue
			}
			downloadedSeqs[segment.Sequence] = true
		}
	}

	ticker := time.NewTicker(playlistPollInterval)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			// Cancelled by the caller or by the quit key.
			return finalizeAndConvert(output, tsPath, outputName)
		case <-ticker.C:
		}

		body, _, err := fetchWithRedirects(ctx, client, playlistURL, maxRedirects)
		if err != nil {
			continue // keep polling
		}
		playlist, listType, err := m3u8.DecodeFrom(bufio.NewReader(body), true)
		body.Close()
		if err != nil || listType != m3u8.MEDIA {
			continue
		}
		mediaPlaylist, ok := playlist.(*m3u8.MediaPlaylist)
		if !ok {
			continue
		}

		appendPending(mediaPlaylist)
		if mediaPlaylist.Closed {
			// The stream has ended; its last segments were fetched above.
			return finalizeAndConvert(output, tsPath, outputName)
		}
	}
}

// appendSegment downloads segment onto the current end of output. If the
// download fails part-way, the bytes it already wrote are removed again
// (best effort) so that a later retry does not leave a duplicated fragment.
func appendSegment(ctx context.Context, client *http.Client, playlistURL *url.URL, segment segmentInfo, output *os.File) error {
	start, seekErr := output.Seek(0, io.SeekCurrent)
	err := downloadSegment(ctx, client, playlistURL, segment, output)
	if err != nil && seekErr == nil {
		// Roll back to where this segment started. Errors are ignored: the
		// download error is the one worth reporting.
		if truncErr := output.Truncate(start); truncErr == nil {
			_, _ = output.Seek(start, io.SeekStart)
		}
	}
	return err
}
// finalizeOutput transmuxes the transport stream at tempPath into an MP4 named
// after outputName. The intermediate file is removed only after a successful
// conversion; on failure it is left in place and the error is returned.
func finalizeOutput(ctx context.Context, tempPath, outputName string) error {
	target := ensureMP4Extension(outputName)
	err := transmuxToMP4(ctx, tempPath, target)
	if err == nil {
		os.Remove(tempPath)
	}
	return err
}
// finalizeAndConvert flushes and closes output, then tries to convert the
// recording at tsPath into an MP4 named after outputName.
//
// The conversion runs under a background context. A failed conversion is
// reported as a warning on stderr only, because the .ts recording is still on
// disk; the function therefore always returns nil.
func finalizeAndConvert(output *os.File, tsPath, outputName string) error {
	// Make sure everything is on disk before ffmpeg reads the file.
	output.Sync()
	output.Close()

	fmt.Fprintf(os.Stderr, "Converting to MP4...\n")
	err := convertTSToMP4(context.Background(), tsPath, outputName)
	if err == nil {
		fmt.Fprintf(os.Stderr, "Conversion complete: %s\n", ensureMP4Extension(outputName))
		return nil
	}
	fmt.Fprintf(os.Stderr, "Warning: MP4 conversion failed, but .ts file is saved: %v\n", err)
	return nil
}
func convertTSToMP4(ctx context.Context, tsPath, outputName string) error {
// Only convert if ffmpeg is available, but don't fail if it's not
if _, err := exec.LookPath("ffmpeg"); err != nil {
return fmt.Errorf("ffmpeg not found in PATH")
}
mp4Name := ensureMP4Extension(outputName)
// Use background context for conversion to ensure it completes even if main context is cancelled
if err := transmuxToMP4(ctx, tsPath, mp4Name); err != nil {
return err
}
return nil
}
// peekContentType returns the Content-Type header served for target.
//
// It issues a HEAD request first. When the server answers 405 Method Not
// Allowed or 501 Not Implemented, i.e. it does not support HEAD for this
// resource, the probe is repeated with GET and only the response headers are
// used. Any other status of 400 or above is returned as an error.
func peekContentType(ctx context.Context, client *http.Client, target *url.URL) (string, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodHead, target.String(), nil)
	if err != nil {
		return "", fmt.Errorf("create HEAD request: %w", err)
	}
	resp, err := client.Do(req)
	if err != nil {
		return "", fmt.Errorf("head request: %w", err)
	}
	resp.Body.Close()

	switch {
	case resp.StatusCode == http.StatusMethodNotAllowed || resp.StatusCode == http.StatusNotImplemented:
		// HEAD is not supported here; retry with GET below.
	case resp.StatusCode >= 400:
		return "", fmt.Errorf("head status %s", resp.Status)
	default:
		return resp.Header.Get("Content-Type"), nil
	}

	req, err = http.NewRequestWithContext(ctx, http.MethodGet, target.String(), nil)
	if err != nil {
		return "", fmt.Errorf("create GET request: %w", err)
	}
	resp, err = client.Do(req)
	if err != nil {
		return "", fmt.Errorf("get request: %w", err)
	}
	// Only the headers matter; the body is closed unread.
	defer resp.Body.Close()
	if resp.StatusCode >= 400 {
		return "", fmt.Errorf("get status %s", resp.Status)
	}
	return resp.Header.Get("Content-Type"), nil
}
// extractPlaylistsFromHTML downloads the page at target and returns the
// candidate stream URLs found in it, made absolute against target and
// de-duplicated in order of first appearance.
//
// Candidates come from, in this order: the src attribute of <video> and
// <source> elements; the data-src, data-hls and data-hls-src attributes of any
// element; href attributes of <a> and <link> that end in ".m3u8" (optionally
// followed by a query string); and absolute .m3u8 URLs inside <script> text.
// The src and data-* values are taken as they are, without checking that they
// point at a playlist.
func extractPlaylistsFromHTML(ctx context.Context, client *http.Client, target *url.URL) ([]string, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, target.String(), nil)
	if err != nil {
		return nil, fmt.Errorf("create request: %w", err)
	}
	resp, err := client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("fetch page: %w", err)
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("unexpected status %s", resp.Status)
	}

	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("parse html: %w", err)
	}

	var candidates []string
	// collect records ref if it resolves to a usable absolute URL.
	collect := func(ref string) {
		if abs := resolveURL(target, ref); abs != "" {
			candidates = append(candidates, abs)
		}
	}

	doc.Find("video,source").Each(func(_ int, node *goquery.Selection) {
		if src, ok := node.Attr("src"); ok {
			collect(src)
		}
	})

	doc.Find("*[data-src], *[data-hls], *[data-hls-src]").Each(func(_ int, node *goquery.Selection) {
		for _, attr := range [...]string{"data-src", "data-hls", "data-hls-src"} {
			if val, ok := node.Attr(attr); ok {
				collect(val)
			}
		}
	})

	m3u8Link := regexp.MustCompile(`(?i)\.m3u8(\?.*)?$`)
	doc.Find("a[href], link[href]").Each(func(_ int, node *goquery.Selection) {
		href, ok := node.Attr("href")
		if !ok || !m3u8Link.MatchString(href) {
			return
		}
		collect(href)
	})

	candidates = append(candidates, findPlaylistInScripts(target, doc)...)
	return uniqueStrings(candidates), nil
}
// resolveURL turns ref into an absolute URL string relative to baseURL.
// It returns "" for an empty reference, for "data:" URIs, and for references
// that cannot be parsed.
func resolveURL(baseURL *url.URL, ref string) string {
	switch {
	case ref == "":
		return ""
	case strings.HasPrefix(ref, "data:"):
		return ""
	}

	abs, err := baseURL.Parse(ref)
	if err != nil {
		return ""
	}
	return abs.String()
}
// findPlaylistInScripts scans the text of every <script> element in doc for
// absolute http(s) URLs containing ".m3u8" and returns them in document order,
// passed through resolveURL with baseURL. A URL ends at the first quote,
// backslash or whitespace character. Duplicates are not removed here.
func findPlaylistInScripts(baseURL *url.URL, doc *goquery.Document) []string {
	absoluteM3U8 := regexp.MustCompile(`https?://[^"'\\\s]+\.m3u8[^"'\\\s]*`)

	var found []string
	doc.Find("script").Each(func(_ int, script *goquery.Selection) {
		hits := absoluteM3U8.FindAllString(script.Text(), -1)
		for i := range hits {
			resolved := resolveURL(baseURL, hits[i])
			if resolved == "" {
				continue
			}
			found = append(found, resolved)
		}
	})
	return found
}
// uniqueStrings returns the non-empty strings of in with duplicates removed,
// keeping the position of each string's first occurrence. The result is never
// nil, even for a nil or empty input.
func uniqueStrings(in []string) []string {
	result := make([]string, 0, len(in))
	visited := make(map[string]struct{}, len(in))
	for i := range in {
		s := in[i]
		if s == "" {
			continue
		}
		if _, dup := visited[s]; !dup {
			visited[s] = struct{}{}
			result = append(result, s)
		}
	}
	return result
}
// collectSegments returns the downloadable segments of playlist, ordered by
// ascending sequence id. Entries that are nil or have no URI are left out.
func collectSegments(playlist *m3u8.MediaPlaylist) []segmentInfo {
	ordered := make([]segmentInfo, 0, len(playlist.Segments))
	for _, entry := range playlist.Segments {
		if entry != nil && entry.URI != "" {
			ordered = append(ordered, segmentInfo{Sequence: entry.SeqId, URI: entry.URI})
		}
	}
	sort.Slice(ordered, func(a, b int) bool {
		return ordered[a].Sequence < ordered[b].Sequence
	})
	return ordered
}
// selectVariant picks the variant of master with the highest bandwidth and
// returns its URI resolved against base. Variants that are nil or have no URI
// are ignored; among equal bandwidths the first one listed wins. It returns
// errUnsupportedMaster when no usable variant exists.
func selectVariant(master *m3u8.MasterPlaylist, base *url.URL) (string, error) {
	var best *m3u8.Variant
	for _, candidate := range master.Variants {
		usable := candidate != nil && candidate.URI != ""
		if !usable {
			continue
		}
		if best == nil || candidate.Bandwidth > best.Bandwidth {
			best = candidate
		}
	}
	if best == nil {
		return "", errUnsupportedMaster
	}

	abs, err := base.Parse(best.URI)
	if err != nil {
		return "", fmt.Errorf("parse variant URL: %w", err)
	}
	return abs.String(), nil
}
// downloadSegment fetches one segment and streams its body into output.
// The segment URI is resolved against base, the playlist URL. Any status other
// than 200 OK is an error. If copying fails part-way, the bytes already
// written to output stay there.
func downloadSegment(ctx context.Context, client *http.Client, base *url.URL, segment segmentInfo, output io.Writer) error {
	target, err := base.Parse(segment.URI)
	if err != nil {
		return fmt.Errorf("parse segment URL: %w", err)
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, target.String(), nil)
	if err != nil {
		return fmt.Errorf("create request: %w", err)
	}

	resp, err := client.Do(req)
	if err != nil {
		return fmt.Errorf("fetch segment: %w", err)
	}
	defer resp.Body.Close()

	if code := resp.StatusCode; code != http.StatusOK {
		return fmt.Errorf("unexpected status %s", resp.Status)
	}

	_, copyErr := io.Copy(output, resp.Body)
	if copyErr != nil {
		return fmt.Errorf("write segment: %w", copyErr)
	}
	return nil
}
// fetchWithRedirects GETs streamURL and returns the response body together
// with the URL the body was actually served from. The caller must close the
// body.
//
// http.Client follows redirects itself unless its CheckRedirect says
// otherwise, so the serving URL is taken from the response's final request
// rather than from the URL that was asked for; relative URIs inside the
// fetched document must be resolved against that URL. Redirect responses the
// client hands back unfollowed are followed here, at most redirects times.
//
// Any final status other than 200 OK, a redirect without a Location header,
// and exceeding the redirect budget are errors.
func fetchWithRedirects(ctx context.Context, client *http.Client, streamURL *url.URL, redirects int) (io.ReadCloser, *url.URL, error) {
	currentURL := streamURL
	for count := 0; count <= redirects; count++ {
		req, err := http.NewRequestWithContext(ctx, http.MethodGet, currentURL.String(), nil)
		if err != nil {
			return nil, nil, fmt.Errorf("create request: %w", err)
		}
		resp, err := client.Do(req)
		if err != nil {
			return nil, nil, fmt.Errorf("http get: %w", err)
		}

		// resp.Request is the last request the client sent, i.e. the one
		// after any redirects it followed on its own.
		servedFrom := currentURL
		if resp.Request != nil && resp.Request.URL != nil {
			servedFrom = resp.Request.URL
		}

		switch {
		case resp.StatusCode >= 300 && resp.StatusCode < 400:
			location := resp.Header.Get("Location")
			resp.Body.Close()
			if location == "" {
				return nil, nil, errors.New("redirect without location header")
			}
			nextURL, err := servedFrom.Parse(location)
			if err != nil {
				return nil, nil, fmt.Errorf("parse redirect URL: %w", err)
			}
			currentURL = nextURL
		case resp.StatusCode == http.StatusOK:
			return resp.Body, servedFrom, nil
		default:
			resp.Body.Close()
			return nil, nil, fmt.Errorf("unexpected status %s", resp.Status)
		}
	}
	return nil, nil, errors.New("too many redirects")
}
// inferOutputName derives an MP4 file name from the last path element of
// streamURL: the element's extension is replaced by ".mp4". When the path has
// no usable last element, or nothing is left after removing the extension,
// the name "download.mp4" is used.
func inferOutputName(streamURL *url.URL) string {
	const fallback = "download"

	name := path.Base(streamURL.Path)
	switch name {
	case ".", "/", "":
		name = fallback
	}

	stem := strings.TrimSuffix(name, path.Ext(name))
	if stem == "" {
		stem = fallback
	}
	return stem + ".mp4"
}
// ensureMP4Extension returns name with an ".mp4" extension. A name that
// already ends in ".mp4" (in any letter case) is returned unchanged; otherwise
// its current extension, if any, is replaced. If nothing is left of the name
// once the extension is removed, "output.mp4" is returned.
func ensureMP4Extension(name string) string {
	if strings.HasSuffix(strings.ToLower(name), ".mp4") {
		return name
	}

	// filepath.Ext is always a suffix of name, so slicing it off is safe.
	stem := name[:len(name)-len(filepath.Ext(name))]
	if stem == "" {
		stem = "output"
	}
	return stem + ".mp4"
}
// ensureTSExtension returns name with a ".ts" extension. A name that already
// ends in ".ts" (in any letter case) is returned unchanged; otherwise its
// current extension, if any, is replaced. If nothing is left of the name once
// the extension is removed, "output.ts" is returned.
func ensureTSExtension(name string) string {
	if strings.HasSuffix(strings.ToLower(name), ".ts") {
		return name
	}

	// filepath.Ext is always a suffix of name, so slicing it off is safe.
	stem := name[:len(name)-len(filepath.Ext(name))]
	if stem == "" {
		stem = "output"
	}
	return stem + ".ts"
}
func inferOutputNameMust(rawURL string) string {
parsed, err := url.Parse(rawURL)
if err != nil {
return ensureMP4Extension("download")
}
return inferOutputName(parsed)
}
// transmuxToMP4 repackages the transport stream at tsPath into an MP4 at
// mp4Path by running ffmpeg in stream-copy mode (no re-encoding), overwriting
// any existing file. The output directory is created when missing.
//
// It returns errFFmpegMissing when ffmpeg is not on PATH. When ffmpeg fails,
// the partial output file is removed and the returned error wraps the
// underlying execution error, so callers can inspect it with errors.Is or
// errors.As, followed by whatever ffmpeg printed on stderr. Cancelling ctx
// stops the ffmpeg process.
func transmuxToMP4(ctx context.Context, tsPath, mp4Path string) error {
	if _, err := exec.LookPath("ffmpeg"); err != nil {
		return errFFmpegMissing
	}
	if err := os.MkdirAll(filepath.Dir(mp4Path), 0o755); err != nil {
		return fmt.Errorf("ensure output directory: %w", err)
	}

	cmd := exec.CommandContext(ctx, "ffmpeg", "-y", "-i", tsPath, "-c", "copy", "-movflags", "+faststart", mp4Path)
	cmd.Stdout = io.Discard
	var stderr bytes.Buffer
	cmd.Stderr = &stderr

	if err := cmd.Run(); err != nil {
		// Do not leave a truncated MP4 behind.
		os.Remove(mp4Path)
		if msg := strings.TrimSpace(stderr.String()); msg != "" {
			return fmt.Errorf("ffmpeg: %w: %s", err, msg)
		}
		return fmt.Errorf("ffmpeg: %w", err)
	}
	return nil
}