Source file src/net/http/sniff.go

     1  // Copyright 2011 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package http
     6  
     7  import (
     8  	"bytes"
     9  	"encoding/binary"
    10  )
    11  
// sniffLen is the maximum number of leading bytes the sniffing algorithm
// uses to make its decision; DetectContentType truncates longer inputs to
// this length before matching.
const sniffLen = 512
    14  
    15  // DetectContentType implements the algorithm described
    16  // at https://mimesniff.spec.whatwg.org/ to determine the
    17  // Content-Type of the given data. It considers at most the
    18  // first 512 bytes of data. DetectContentType always returns
    19  // a valid MIME type: if it cannot determine a more specific one, it
    20  // returns "application/octet-stream".
    21  func DetectContentType(data []byte) string {
    22  	if len(data) > sniffLen {
    23  		data = data[:sniffLen]
    24  	}
    25  
    26  	// Index of the first non-whitespace byte in data.
    27  	firstNonWS := 0
    28  	for ; firstNonWS < len(data) && isWS(data[firstNonWS]); firstNonWS++ {
    29  	}
    30  
    31  	for _, sig := range sniffSignatures {
    32  		if ct := sig.match(data, firstNonWS); ct != "" {
    33  			return ct
    34  		}
    35  	}
    36  
    37  	return "application/octet-stream" // fallback
    38  }
    39  
    40  // isWS reports whether the provided byte is a whitespace byte (0xWS)
    41  // as defined in https://mimesniff.spec.whatwg.org/#terminology.
    42  func isWS(b byte) bool {
    43  	switch b {
    44  	case '\t', '\n', '\x0c', '\r', ' ':
    45  		return true
    46  	}
    47  	return false
    48  }
    49  
    50  // isTT reports whether the provided byte is a tag-terminating byte (0xTT)
    51  // as defined in https://mimesniff.spec.whatwg.org/#terminology.
    52  func isTT(b byte) bool {
    53  	switch b {
    54  	case ' ', '>':
    55  		return true
    56  	}
    57  	return false
    58  }
    59  
// sniffSig is the interface satisfied by each entry of the sniffSignatures
// table.
type sniffSig interface {
	// match returns the MIME type of the data, or "" if unknown.
	// firstNonWS is the index of the first non-whitespace byte in data
	// (len(data) when there is none), as computed by DetectContentType.
	match(data []byte, firstNonWS int) string
}
    64  
// sniffSignatures is the ordered list of signatures consulted by
// DetectContentType: entries are tried from first to last and the first
// one that matches determines the result, so ordering is significant.
// Data matching the table in section 6.
var sniffSignatures = []sniffSig{
	// HTML tags. htmlSig skips leading whitespace, compares letters
	// case-insensitively, and requires a tag-terminating byte afterwards.
	htmlSig("<!DOCTYPE HTML"),
	htmlSig("<HTML"),
	htmlSig("<HEAD"),
	htmlSig("<SCRIPT"),
	htmlSig("<IFRAME"),
	htmlSig("<H1"),
	htmlSig("<DIV"),
	htmlSig("<FONT"),
	htmlSig("<TABLE"),
	htmlSig("<A"),
	htmlSig("<STYLE"),
	htmlSig("<TITLE"),
	htmlSig("<B"),
	htmlSig("<BODY"),
	htmlSig("<BR"),
	htmlSig("<P"),
	htmlSig("<!--"),
	// "<?xml", optionally preceded by whitespace (skipWS).
	&maskedSig{
		mask:   []byte("\xFF\xFF\xFF\xFF\xFF"),
		pat:    []byte("<?xml"),
		skipWS: true,
		ct:     "text/xml; charset=utf-8"},
	&exactSig{[]byte("%PDF-"), "application/pdf"},
	&exactSig{[]byte("%!PS-Adobe-"), "application/postscript"},

	// UTF BOMs.
	// The zero mask bytes make the bytes following the BOM "don't care",
	// but they still count toward the minimum data length.
	&maskedSig{
		mask: []byte("\xFF\xFF\x00\x00"),
		pat:  []byte("\xFE\xFF\x00\x00"),
		ct:   "text/plain; charset=utf-16be",
	},
	&maskedSig{
		mask: []byte("\xFF\xFF\x00\x00"),
		pat:  []byte("\xFF\xFE\x00\x00"),
		ct:   "text/plain; charset=utf-16le",
	},
	&maskedSig{
		mask: []byte("\xFF\xFF\xFF\x00"),
		pat:  []byte("\xEF\xBB\xBF\x00"),
		ct:   "text/plain; charset=utf-8",
	},

	// Image types
	// For posterity, we originally returned "image/vnd.microsoft.icon" from
	// https://tools.ietf.org/html/draft-ietf-websec-mime-sniff-03#section-7
	// https://codereview.appspot.com/4746042
	// but that has since been replaced with "image/x-icon" in Section 6.2
	// of https://mimesniff.spec.whatwg.org/#matching-an-image-type-pattern
	&exactSig{[]byte("\x00\x00\x01\x00"), "image/x-icon"},
	&exactSig{[]byte("\x00\x00\x02\x00"), "image/x-icon"},
	&exactSig{[]byte("BM"), "image/bmp"},
	&exactSig{[]byte("GIF87a"), "image/gif"},
	&exactSig{[]byte("GIF89a"), "image/gif"},
	&maskedSig{
		mask: []byte("\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF"),
		pat:  []byte("RIFF\x00\x00\x00\x00WEBPVP"),
		ct:   "image/webp",
	},
	&exactSig{[]byte("\x89PNG\x0D\x0A\x1A\x0A"), "image/png"},
	&exactSig{[]byte("\xFF\xD8\xFF"), "image/jpeg"},

	// Audio and Video types
	// Enforce the pattern match ordering as prescribed in
	// https://mimesniff.spec.whatwg.org/#matching-an-audio-or-video-type-pattern
	&maskedSig{
		mask: []byte("\xFF\xFF\xFF\xFF"),
		pat:  []byte(".snd"),
		ct:   "audio/basic",
	},
	&maskedSig{
		mask: []byte("\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF"),
		pat:  []byte("FORM\x00\x00\x00\x00AIFF"),
		ct:   "audio/aiff",
	},
	&maskedSig{
		mask: []byte("\xFF\xFF\xFF"),
		pat:  []byte("ID3"),
		ct:   "audio/mpeg",
	},
	&maskedSig{
		mask: []byte("\xFF\xFF\xFF\xFF\xFF"),
		pat:  []byte("OggS\x00"),
		ct:   "application/ogg",
	},
	&maskedSig{
		mask: []byte("\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"),
		pat:  []byte("MThd\x00\x00\x00\x06"),
		ct:   "audio/midi",
	},
	&maskedSig{
		mask: []byte("\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF"),
		pat:  []byte("RIFF\x00\x00\x00\x00AVI "),
		ct:   "video/avi",
	},
	&maskedSig{
		mask: []byte("\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF"),
		pat:  []byte("RIFF\x00\x00\x00\x00WAVE"),
		ct:   "audio/wave",
	},
	// 6.2.0.2. video/mp4
	mp4Sig{},
	// 6.2.0.3. video/webm
	&exactSig{[]byte("\x1A\x45\xDF\xA3"), "video/webm"},

	// Font types
	&maskedSig{
		// 34 NULL bytes followed by the string "LP"
		pat: []byte("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00LP"),
		// 34 NULL bytes followed by \xFF\xFF, so only the trailing "LP"
		// is actually compared.
		mask: []byte("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xFF\xFF"),
		ct:   "application/vnd.ms-fontobject",
	},
	&exactSig{[]byte("\x00\x01\x00\x00"), "font/ttf"},
	&exactSig{[]byte("OTTO"), "font/otf"},
	&exactSig{[]byte("ttcf"), "font/collection"},
	&exactSig{[]byte("wOFF"), "font/woff"},
	&exactSig{[]byte("wOF2"), "font/woff2"},

	// Archive types
	&exactSig{[]byte("\x1F\x8B\x08"), "application/x-gzip"},
	&exactSig{[]byte("PK\x03\x04"), "application/zip"},
	// RAR's signatures are incorrectly defined by the MIME spec as per
	//    https://github.com/whatwg/mimesniff/issues/63
	// However, RAR Labs correctly defines it at:
	//    https://www.rarlab.com/technote.htm#rarsign
	// so we use the definition from RAR Labs.
	// TODO: do whatever the spec ends up doing.
	&exactSig{[]byte("Rar!\x1A\x07\x00"), "application/x-rar-compressed"},     // RAR v1.5-v4.0
	&exactSig{[]byte("Rar!\x1A\x07\x01\x00"), "application/x-rar-compressed"}, // RAR v5+

	&exactSig{[]byte("\x00\x61\x73\x6D"), "application/wasm"},

	// textSig accepts any data that contains none of the control bytes it
	// rejects, so placing it earlier would shadow the entries after it.
	textSig{}, // should be last
}
   201  
   202  type exactSig struct {
   203  	sig []byte
   204  	ct  string
   205  }
   206  
   207  func (e *exactSig) match(data []byte, firstNonWS int) string {
   208  	if bytes.HasPrefix(data, e.sig) {
   209  		return e.ct
   210  	}
   211  	return ""
   212  }
   213  
   214  type maskedSig struct {
   215  	mask, pat []byte
   216  	skipWS    bool
   217  	ct        string
   218  }
   219  
   220  func (m *maskedSig) match(data []byte, firstNonWS int) string {
   221  	// pattern matching algorithm section 6
   222  	// https://mimesniff.spec.whatwg.org/#pattern-matching-algorithm
   223  
   224  	if m.skipWS {
   225  		data = data[firstNonWS:]
   226  	}
   227  	if len(m.pat) != len(m.mask) {
   228  		return ""
   229  	}
   230  	if len(data) < len(m.pat) {
   231  		return ""
   232  	}
   233  	for i, pb := range m.pat {
   234  		maskedData := data[i] & m.mask[i]
   235  		if maskedData != pb {
   236  			return ""
   237  		}
   238  	}
   239  	return m.ct
   240  }
   241  
   242  type htmlSig []byte
   243  
   244  func (h htmlSig) match(data []byte, firstNonWS int) string {
   245  	data = data[firstNonWS:]
   246  	if len(data) < len(h)+1 {
   247  		return ""
   248  	}
   249  	for i, b := range h {
   250  		db := data[i]
   251  		if 'A' <= b && b <= 'Z' {
   252  			db &= 0xDF
   253  		}
   254  		if b != db {
   255  			return ""
   256  		}
   257  	}
   258  	// Next byte must be a tag-terminating byte(0xTT).
   259  	if !isTT(data[len(h)]) {
   260  		return ""
   261  	}
   262  	return "text/html; charset=utf-8"
   263  }
   264  
   265  var mp4ftype = []byte("ftyp")
   266  var mp4 = []byte("mp4")
   267  
   268  type mp4Sig struct{}
   269  
   270  func (mp4Sig) match(data []byte, firstNonWS int) string {
   271  	// https://mimesniff.spec.whatwg.org/#signature-for-mp4
   272  	// c.f. section 6.2.1
   273  	if len(data) < 12 {
   274  		return ""
   275  	}
   276  	boxSize := int(binary.BigEndian.Uint32(data[:4]))
   277  	if len(data) < boxSize || boxSize%4 != 0 {
   278  		return ""
   279  	}
   280  	if !bytes.Equal(data[4:8], mp4ftype) {
   281  		return ""
   282  	}
   283  	for st := 8; st < boxSize; st += 4 {
   284  		if st == 12 {
   285  			// Ignores the four bytes that correspond to the version number of the "major brand".
   286  			continue
   287  		}
   288  		if bytes.Equal(data[st:st+3], mp4) {
   289  			return "video/mp4"
   290  		}
   291  	}
   292  	return ""
   293  }
   294  
   295  type textSig struct{}
   296  
   297  func (textSig) match(data []byte, firstNonWS int) string {
   298  	// c.f. section 5, step 4.
   299  	for _, b := range data[firstNonWS:] {
   300  		switch {
   301  		case b <= 0x08,
   302  			b == 0x0B,
   303  			0x0E <= b && b <= 0x1A,
   304  			0x1C <= b && b <= 0x1F:
   305  			return ""
   306  		}
   307  	}
   308  	return "text/plain; charset=utf-8"
   309  }
   310  

View as plain text