diff --git a/.eslintrc.js b/.eslintrc.js
index 7939452..d62f0d6 100644
--- a/.eslintrc.js
+++ b/.eslintrc.js
@@ -18,6 +18,8 @@ module.exports = {
   },
   plugins: ['@typescript-eslint', 'import', 'prettier'],
   rules: {
+    'no-plusplus': 'off',
+    'no-bitwise': 'off',
     'no-underscore-dangle': 'off',
     '@typescript-eslint/no-explicit-any': 'off',
     'no-console': 'off',
diff --git a/README.md b/README.md
index d30d021..0042142 100644
--- a/README.md
+++ b/README.md
@@ -26,6 +26,7 @@ The following CLI Mode arguments are available
 | `--season` | `-s` | Season number. Only used if type is `show` | `0` |
 | `--episode` | `-e` | Episode number. Only used if type is `show` | `0` |
 | `--url` | `-u` | URL to a video embed. Only used if source is an embed | |
+| `--headers` | `-H` | Optional headers to send while scraping. JSON encoded | |
 | `--help` | `-h` | Shows help for the command arguments | |
 
 Example testing the FlixHQ source on the movie "Spirited Away"
diff --git a/src/dev-cli.ts b/src/dev-cli.ts
index 54d663a..57b56f0 100644
--- a/src/dev-cli.ts
+++ b/src/dev-cli.ts
@@ -39,6 +39,7 @@ type CommandLineArguments = {
   season: string;
   episode: string;
   url: string;
+  headers?: Record<string, string>;
 };
 
 const TMDB_API_KEY = process.env.MOVIE_WEB_TMDB_API_KEY ?? '';
@@ -185,6 +186,7 @@ async function runScraper(providers: ProviderControls, source: MetaOutput, optio
     const result = await providers.runEmbedScraper({
       url: options.url,
       id: source.id,
+      headers: options.headers,
     });
     spinnies.succeed('scrape', { text: 'Done!' });
     logDeepObject(result);
@@ -279,6 +281,15 @@ async function processOptions(options: CommandLineArguments) {
     }
   }
 
+  // Headers arrive from the command line as a JSON string; decode them before use
+  if (typeof options.headers === 'string') {
+    try {
+      options.headers = JSON.parse(options.headers);
+    } catch {
+      throw new Error('Headers must be a valid JSON encoded object');
+    }
+  }
+
   let fetcher;
 
   if (options.fetcher === 'native') {
@@ -409,7 +420,8 @@ async function runCommandLine() {
     .option('-t, --type <type>', "Media type. Either 'movie' or 'show'. Only used if source is a provider", 'movie')
     .option('-s, --season <number>', "Season number. Only used if type is 'show'", '0')
     .option('-e, --episode <number>', "Episode number. Only used if type is 'show'", '0')
-    .option('-u, --url <embed URL>', 'URL to a video embed. Only used if source is an embed', '');
+    .option('-u, --url <embed URL>', 'URL to a video embed. Only used if source is an embed', '')
+    .option('-H, --headers <JSON>', 'Optional headers to pass to scrapers. JSON encoded');
 
   program.parse();
 
diff --git a/src/fetchers/common.ts b/src/fetchers/common.ts
index e31b6d1..a2b77aa 100644
--- a/src/fetchers/common.ts
+++ b/src/fetchers/common.ts
@@ -34,6 +34,7 @@ export function makeFullFetcher(fetcher: Fetcher): UseableFetcher {
       query: ops?.query ?? {},
       baseUrl: ops?.baseUrl ?? '',
       body: ops?.body,
+      returnRaw: ops?.returnRaw ?? false,
     });
   };
 }
diff --git a/src/fetchers/fetch.ts b/src/fetchers/fetch.ts
index 1d419f0..8311fb2 100644
--- a/src/fetchers/fetch.ts
+++ b/src/fetchers/fetch.ts
@@ -17,6 +17,7 @@ export type FetchReply = {
   text(): Promise<string>;
   json(): Promise<any>;
   headers: FetchHeaders;
+  url: string;
 };
 
 export type FetchLike = (url: string, ops?: FetchOps | undefined) => Promise<FetchReply>;
diff --git a/src/fetchers/standardFetch.ts b/src/fetchers/standardFetch.ts
index dd84893..bdf14d1 100644
--- a/src/fetchers/standardFetch.ts
+++ b/src/fetchers/standardFetch.ts
@@ -17,6 +17,10 @@ export function makeStandardFetcher(f: FetchLike): Fetcher {
       body: seralizedBody.body,
     });
 
+    if (ops.returnRaw) {
+      return res;
+    }
+
     const isJson = res.headers.get('content-type')?.includes('application/json');
     if (isJson) return res.json();
     return res.text();
diff --git a/src/fetchers/types.ts b/src/fetchers/types.ts
index 2d14748..8e581a6 100644
--- a/src/fetchers/types.ts
+++ b/src/fetchers/types.ts
@@ -4,8 +4,9 @@ export type FetcherOptions = {
   baseUrl?: string;
   headers?: Record<string, string>;
   query?: Record<string, string>;
-  method?: 'GET' | 'POST';
+  method?: 'HEAD' | 'GET' | 'POST';
   body?: Record<string, any> | string | FormData | URLSearchParams;
+  returnRaw?: boolean;
 };
 
 export type DefaultedFetcherOptions = {
@@ -13,7 +14,8 @@ export type DefaultedFetcherOptions = {
   body?: Record<string, any> | string | FormData;
   headers: Record<string, string>;
   query: Record<string, string>;
-  method: 'GET' | 'POST';
+  method: 'HEAD' | 'GET' | 'POST';
+  returnRaw: boolean;
 };
 
 export type Fetcher = {
diff --git a/src/main/builder.ts b/src/main/builder.ts
index 0322dbd..a02f298 100644
--- a/src/main/builder.ts
+++ b/src/main/builder.ts
@@ -57,6 +57,9 @@ export interface EmbedRunnerOptions {
 
   // id of the embed scraper you want to scrape from
   id: string;
+
+  // optional headers for the embed scraper to use
+  headers?: Record<string, string>;
 }
 
 export interface ProviderControls {
diff --git a/src/main/individualRunner.ts b/src/main/individualRunner.ts
index ac563ea..d37f333 100644
--- a/src/main/individualRunner.ts
+++ b/src/main/individualRunner.ts
@@ -66,6 +66,7 @@ export type IndividualEmbedRunnerOptions = {
   url: string;
   id: string;
   events?: IndividualScraperEvents;
+  headers?: Record<string, string>;
 };
 
 export async function scrapeIndividualEmbed(
@@ -79,6 +80,7 @@ export async function scrapeIndividualEmbed(
     fetcher: ops.fetcher,
     proxiedFetcher: ops.proxiedFetcher,
     url: ops.url,
+    headers: ops.headers,
     progress(val) {
       ops.events?.update?.({
         id: embedScraper.id,
diff --git a/src/providers/all.ts b/src/providers/all.ts
index 847ebf1..533b1ba 100644
--- a/src/providers/all.ts
+++ b/src/providers/all.ts
@@ -2,15 +2,18 @@ import { Embed, Sourcerer } from '@/providers/base';
 import { febBoxScraper } from '@/providers/embeds/febBox';
 import { mixdropScraper } from '@/providers/embeds/mixdrop';
 import { mp4uploadScraper } from '@/providers/embeds/mp4upload';
+import { streambucketScraper } from '@/providers/embeds/streambucket';
 import { streamsbScraper } from '@/providers/embeds/streamsb';
 import { upcloudScraper } from '@/providers/embeds/upcloud';
 import { upstreamScraper } from '@/providers/embeds/upstream';
+import { vidsrcembedScraper } from '@/providers/embeds/vidsrc';
 import { flixhqScraper } from '@/providers/sources/flixhq/index';
 import { goMoviesScraper } from '@/providers/sources/gomovies/index';
 import { kissAsianScraper } from '@/providers/sources/kissasian/index';
 import { lookmovieScraper } from '@/providers/sources/lookmovie';
 import { remotestreamScraper } from '@/providers/sources/remotestream';
 import { superStreamScraper } from '@/providers/sources/superstream/index';
+import { vidsrcScraper } from '@/providers/sources/vidsrc';
 import { zoechipScraper } from '@/providers/sources/zoechip';
 
 import { smashyStreamDScraper } from './embeds/smashystream/dued';
@@ -27,6 +30,7 @@ export function gatherAllSources(): Array<Sourcerer> {
     superStreamScraper,
     goMoviesScraper,
     zoechipScraper,
+    vidsrcScraper,
     lookmovieScraper,
     showBoxScraper,
     smashyStreamScraper,
@@ -40,8 +44,10 @@ export function gatherAllEmbeds(): Array<Embed> {
     mp4uploadScraper,
     streamsbScraper,
     upstreamScraper,
-    febBoxScraper,
     mixdropScraper,
+    vidsrcembedScraper,
+    streambucketScraper,
+    febBoxScraper,
     smashyStreamFScraper,
     smashyStreamDScraper,
   ];
diff --git a/src/providers/base.ts b/src/providers/base.ts
index 022cd87..6a26fa8 100644
--- a/src/providers/base.ts
+++ b/src/providers/base.ts
@@ -5,6 +5,7 @@ import { EmbedScrapeContext, MovieScrapeContext, ShowScrapeContext } from '@/uti
 export type SourcererEmbed = {
   embedId: string;
   url: string;
+  headers?: Record<string, string>;
 };
 
 export type SourcererOutput = {
diff --git a/src/providers/embeds/streambucket.ts b/src/providers/embeds/streambucket.ts
new file mode 100644
index 0000000..f137ceb
--- /dev/null
+++ b/src/providers/embeds/streambucket.ts
@@ -0,0 +1,98 @@
+import { flags } from '@/main/targets';
+import { makeEmbed } from '@/providers/base';
+
+// StreamBucket makes use of https://github.com/nicxlau/hunter-php-javascript-obfuscator
+
+const hunterRegex = /eval\(function\(h,u,n,t,e,r\).*?\("(.*?)",\d*?,"(.*?)",(\d*?),(\d*?),\d*?\)\)/;
+const linkRegex = /file:"(.*?)"/;
+
+// This is a much more simple and optimized version of the "h,u,n,t,e,r"
+// obfuscation algorithm. It's just basic chunked+mask encoding.
+// I have seen this same encoding used on some sites under the name
+// "p,l,a,y,e,r" as well
+function decodeHunter(encoded: string, mask: string, charCodeOffset: number, delimiterOffset: number) {
+  // The encoded string is made up of 'n' number of chunks.
+  // Each chunk is separated by a delimiter inside the mask.
+  // This offset is also used as the exponentiation base in
+  // the charCode calculations
+  const delimiter = mask[delimiterOffset];
+
+  // Split the 'encoded' string into chunks using the delimiter,
+  // and filter out any empty chunks.
+  const chunks = encoded.split(delimiter).filter((chunk) => chunk);
+
+  // Decode each chunk and concatenate the results to form the final 'decoded' string.
+  const decoded = chunks
+    .map((chunk) => {
+      // Chunks are in reverse order. 'reduceRight' removes the
+      // need to 'reverse' the array first
+      const charCode = chunk.split('').reduceRight((c, value, index) => {
+        // Calculate the character code for each character in the chunk.
+        // This involves finding the index of 'value' in the 'mask' and
+        // multiplying it by (delimiterOffset^position).
+        return c + mask.indexOf(value) * delimiterOffset ** (chunk.length - 1 - index);
+      }, 0);
+
+      // The actual character code is offset by the given amount
+      return String.fromCharCode(charCode - charCodeOffset);
+    })
+    .join('');
+
+  return decoded;
+}
+
+export const streambucketScraper = makeEmbed({
+  id: 'streambucket',
+  name: 'StreamBucket',
+  rank: 196,
+  // TODO - Disabled until ctx.fetcher and ctx.proxiedFetcher don't trigger bot detection
+  disabled: true,
+  async scrape(ctx) {
+    // Using the context fetchers make the site return just the string "No bots please!"?
+    // TODO - Fix this. Native fetch does not trigger this. No idea why right now
+    const response = await fetch(ctx.url);
+    const html = await response.text();
+
+    // This is different than the above mentioned bot detection
+    if (html.includes('captcha-checkbox')) {
+      // TODO - This doesn't use recaptcha, just really basic "image match". Maybe could automate?
+      throw new Error('StreamBucket got captchaed');
+    }
+
+    let regexResult = html.match(hunterRegex);
+
+    if (!regexResult) {
+      throw new Error('Failed to find StreamBucket hunter JavaScript');
+    }
+
+    const encoded = regexResult[1];
+    const mask = regexResult[2];
+    const charCodeOffset = Number(regexResult[3]);
+    const delimiterOffset = Number(regexResult[4]);
+
+    if (Number.isNaN(charCodeOffset)) {
+      throw new Error('StreamBucket hunter JavaScript charCodeOffset is not a valid number');
+    }
+
+    if (Number.isNaN(delimiterOffset)) {
+      throw new Error('StreamBucket hunter JavaScript delimiterOffset is not a valid number');
+    }
+
+    const decoded = decodeHunter(encoded, mask, charCodeOffset, delimiterOffset);
+
+    regexResult = decoded.match(linkRegex);
+
+    if (!regexResult) {
+      throw new Error('Failed to find StreamBucket HLS link');
+    }
+
+    return {
+      stream: {
+        type: 'hls',
+        playlist: regexResult[1],
+        flags: [flags.NO_CORS],
+        captions: [],
+      },
+    };
+  },
+});
diff --git a/src/providers/embeds/vidsrc.ts b/src/providers/embeds/vidsrc.ts
new file mode 100644
index 0000000..da3f2aa
--- /dev/null
+++ b/src/providers/embeds/vidsrc.ts
@@ -0,0 +1,35 @@
+import { makeEmbed } from '@/providers/base';
+
+const hlsURLRegex = /file:"(.*?)"/;
+
+export const vidsrcembedScraper = makeEmbed({
+  id: 'vidsrcembed', // VidSrc is both a source and an embed host
+  name: 'VidSrc',
+  rank: 197,
+  async scrape(ctx) {
+    if (!ctx.headers || (!ctx.headers.referer && !ctx.headers.Referer)) {
+      throw new Error('VidSrc embeds require the referer header to be set');
+    }
+    const html = await ctx.proxiedFetcher<string>(ctx.url, {
+      headers: ctx.headers,
+    });
+
+    const match = html
+      .match(hlsURLRegex)?.[1]
+      ?.replace(/(\/\/\S+?=)/g, '')
+      .replace('#2', '');
+    if (!match) throw new Error('Unable to find HLS playlist');
+    const finalUrl = atob(match);
+
+    if (!finalUrl.includes('.m3u8')) throw new Error('Unable to find HLS playlist');
+
+    return {
+      stream: {
+        type: 'hls',
+        playlist: finalUrl,
+        flags: [],
+        captions: [],
+      },
+    };
+  },
+});
diff --git a/src/providers/sources/vidsrc/common.ts b/src/providers/sources/vidsrc/common.ts
new file mode 100644
index 0000000..4ccc93c
--- /dev/null
+++ b/src/providers/sources/vidsrc/common.ts
@@ -0,0 +1,2 @@
+export const vidsrcBase = 'https://vidsrc.me';
+export const vidsrcRCPBase = 'https://rcp.vidsrc.me';
diff --git a/src/providers/sources/vidsrc/index.ts b/src/providers/sources/vidsrc/index.ts
new file mode 100644
index 0000000..f7ad792
--- /dev/null
+++ b/src/providers/sources/vidsrc/index.ts
@@ -0,0 +1,13 @@
+import { flags } from '@/main/targets';
+import { makeSourcerer } from '@/providers/base';
+import { scrapeMovie } from '@/providers/sources/vidsrc/scrape-movie';
+import { scrapeShow } from '@/providers/sources/vidsrc/scrape-show';
+
+export const vidsrcScraper = makeSourcerer({
+  id: 'vidsrc',
+  name: 'VidSrc',
+  rank: 120,
+  flags: [flags.NO_CORS],
+  scrapeMovie,
+  scrapeShow,
+});
diff --git a/src/providers/sources/vidsrc/scrape-movie.ts b/src/providers/sources/vidsrc/scrape-movie.ts
new file mode 100644
index 0000000..585eb31
--- /dev/null
+++ b/src/providers/sources/vidsrc/scrape-movie.ts
@@ -0,0 +1,8 @@
+import { getVidSrcMovieSources } from '@/providers/sources/vidsrc/scrape';
+import { MovieScrapeContext } from '@/utils/context';
+
+export async function scrapeMovie(ctx: MovieScrapeContext) {
+  return {
+    embeds: await getVidSrcMovieSources(ctx),
+  };
+}
diff --git a/src/providers/sources/vidsrc/scrape-show.ts b/src/providers/sources/vidsrc/scrape-show.ts
new file mode 100644
index 0000000..ff5d2a4
--- /dev/null
+++ b/src/providers/sources/vidsrc/scrape-show.ts
@@ -0,0 +1,8 @@
+import { getVidSrcShowSources } from '@/providers/sources/vidsrc/scrape';
+import { ShowScrapeContext } from '@/utils/context';
+
+export async function scrapeShow(ctx: ShowScrapeContext) {
+  return {
+    embeds: await getVidSrcShowSources(ctx),
+  };
+}
diff --git a/src/providers/sources/vidsrc/scrape.ts b/src/providers/sources/vidsrc/scrape.ts
new file mode 100644
index 0000000..6ea5256
--- /dev/null
+++ b/src/providers/sources/vidsrc/scrape.ts
@@ -0,0 +1,151 @@
+import { load } from 'cheerio';
+
+import { FetchReply } from '@/fetchers/fetch';
+import { SourcererEmbed } from '@/providers/base';
+import { streambucketScraper } from '@/providers/embeds/streambucket';
+import { vidsrcembedScraper } from '@/providers/embeds/vidsrc';
+import { vidsrcBase, vidsrcRCPBase } from '@/providers/sources/vidsrc/common';
+import { MovieScrapeContext, ShowScrapeContext } from '@/utils/context';
+
+// Matches exactly one byte written as two hex digits
+const hexPairRegex = /^[0-9a-fA-F]{2}$/;
+
+// Decodes the hex encoded iframe src found on the RCP page. Every byte
+// is XORed with the seed, which repeats when it is shorter than the data.
+// The hex pairs are parsed by hand to avoid `Buffer`, which is a
+// Node.js-only global
+function decodeSrc(encoded: string, seed: string) {
+  let decoded = '';
+
+  for (let i = 0; i + 1 < encoded.length; i += 2) {
+    const pair = encoded.slice(i, i + 2);
+    // Same as Buffer.from(encoded, 'hex'): stop at the first invalid pair
+    if (!hexPairRegex.test(pair)) break;
+
+    decoded += String.fromCharCode(parseInt(pair, 16) ^ seed.charCodeAt((i / 2) % seed.length));
+  }
+
+  return decoded;
+}
+
+async function getVidSrcEmbeds(ctx: MovieScrapeContext | ShowScrapeContext, startingURL: string) {
+  // VidSrc works by using hashes and a redirect system.
+  // The hashes are stored in the html, and VidSrc will
+  // make requests to their servers with the hash. This
+  // will trigger a 302 response with a Location header
+  // sending the user to the correct embed. To get the
+  // real embed links, we must do the same. Slow, but
+  // required
+
+  const embeds: SourcererEmbed[] = [];
+
+  let html = await ctx.proxiedFetcher<string>(startingURL, {
+    baseUrl: vidsrcBase,
+  });
+
+  let $ = load(html);
+
+  const sourceHashes = $('.server[data-hash]')
+    .toArray()
+    .map((el) => $(el).attr('data-hash'))
+    .filter((hash) => hash !== undefined);
+
+  for (const hash of sourceHashes) {
+    html = await ctx.proxiedFetcher<string>(`/rcp/${hash}`, {
+      baseUrl: vidsrcRCPBase,
+      headers: {
+        referer: `${vidsrcBase}${startingURL}`,
+      },
+    });
+
+    $ = load(html);
+    const encoded = $('#hidden').attr('data-h');
+    const seed = $('body').attr('data-i');
+
+    if (!encoded || !seed) {
+      throw new Error('Failed to find encoded iframe src');
+    }
+
+    let redirectURL = decodeSrc(encoded, seed);
+    if (redirectURL.startsWith('//')) {
+      redirectURL = `https:${redirectURL}`;
+    }
+
+    // Return the raw fetch response here.
+    // When a Location header is sent, fetch
+    // will silently follow it. The "url" inside
+    // the Response is the final requested URL,
+    // which is the real embeds URL
+    const { url: embedURL } = await ctx.proxiedFetcher<FetchReply>(redirectURL, {
+      returnRaw: true,
+      method: 'HEAD', // We don't care about the actual response body here
+      headers: {
+        referer: `${vidsrcRCPBase}/rcp/${hash}`,
+      },
+    });
+
+    const embed: SourcererEmbed = {
+      embedId: '',
+      url: embedURL,
+    };
+
+    const parsedUrl = new URL(embedURL);
+
+    switch (parsedUrl.host) {
+      case 'vidsrc.stream':
+        embed.embedId = vidsrcembedScraper.id;
+        embed.headers = {
+          referer: `${vidsrcRCPBase}/rcp/${hash}`,
+        };
+        break;
+      case 'streambucket.net':
+        embed.embedId = streambucketScraper.id;
+        break;
+      case '2embed.cc':
+      case 'www.2embed.cc':
+        // Just ignore this. This embed just sources from other embeds we can scrape as a 'source'
+        break;
+      case 'player-cdn.com':
+        // Just ignore this. This embed streams video over a custom WebSocket connection
+        break;
+      default:
+        throw new Error(`Failed to find VidSrc embed source for ${embedURL}`);
+    }
+
+    // Since some embeds are ignored on purpose, check if a valid one was found
+    if (embed.embedId !== '') {
+      embeds.push(embed);
+    }
+  }
+
+  return embeds;
+}
+
+export async function getVidSrcMovieSources(ctx: MovieScrapeContext) {
+  return getVidSrcEmbeds(ctx, `/embed/${ctx.media.tmdbId}`);
+}
+
+export async function getVidSrcShowSources(ctx: ShowScrapeContext) {
+  // VidSrc will always default to season 1 episode 1
+  // no matter what embed URL is used. It sends back
+  // a list of ALL the shows episodes, in order, for
+  // all seasons. To get the real embed URL, have to
+  // parse this from the response
+  const html = await ctx.proxiedFetcher<string>(`/embed/${ctx.media.tmdbId}`, {
+    baseUrl: vidsrcBase,
+  });
+
+  const $ = load(html);
+
+  const episodeElement = $(`.ep[data-s="${ctx.media.season.number}"][data-e="${ctx.media.episode.number}"]`).first();
+  if (episodeElement.length === 0) {
+    throw new Error('failed to find episode element');
+  }
+
+  const startingURL = episodeElement.attr('data-iframe');
+  if (!startingURL) {
+    throw new Error('failed to find episode starting URL');
+  }
+
+  return getVidSrcEmbeds(ctx, startingURL);
+}
diff --git a/src/utils/context.ts b/src/utils/context.ts
index 1a84253..aad2711 100644
--- a/src/utils/context.ts
+++ b/src/utils/context.ts
@@ -9,6 +9,7 @@ export type ScrapeContext = {
 
 export type EmbedInput = {
   url: string;
+  headers?: Record<string, string>;
 };
 
 export type EmbedScrapeContext = EmbedInput & ScrapeContext;