Skip to content

Commit d932692

Browse files
committed
fix get youtube captions
1 parent d60308c commit d932692

File tree

2 files changed

+106
-219
lines changed

2 files changed

+106
-219
lines changed

scripts/libs/utils/xmlParser.js

Lines changed: 2 additions & 195 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
// https://stackoverflow.com/a/1773571
2-
1+
// https://stackoverflow.com/a/1773571/11898496
32
export function parseXml(xml) {
43
var dom = null;
54
if (window.DOMParser) {
@@ -23,196 +22,4 @@ export function parseXml(xml) {
2322
return dom;
2423
}
2524

26-
export function xml2json(xml, tab) {
27-
var X = {
28-
toObj: function (xml) {
29-
var o = {};
30-
if (xml.nodeType == 1) {
31-
// element node ..
32-
if (xml.attributes.length)
33-
// element with attributes ..
34-
for (var i = 0; i < xml.attributes.length; i++)
35-
o["@" + xml.attributes[i].nodeName] = (
36-
xml.attributes[i].nodeValue || ""
37-
).toString();
38-
if (xml.firstChild) {
39-
// element has child nodes ..
40-
var textChild = 0,
41-
cdataChild = 0,
42-
hasElementChild = false;
43-
for (var n = xml.firstChild; n; n = n.nextSibling) {
44-
if (n.nodeType == 1) hasElementChild = true;
45-
else if (n.nodeType == 3 && n.nodeValue.match(/[^ \f\n\r\t\v]/))
46-
textChild++; // non-whitespace text
47-
else if (n.nodeType == 4) cdataChild++; // cdata section node
48-
}
49-
if (hasElementChild) {
50-
if (textChild < 2 && cdataChild < 2) {
51-
// structured element with evtl. a single text or/and cdata node ..
52-
X.removeWhite(xml);
53-
for (var n = xml.firstChild; n; n = n.nextSibling) {
54-
if (n.nodeType == 3)
55-
// text node
56-
o["#text"] = X.escape(n.nodeValue);
57-
else if (n.nodeType == 4)
58-
// cdata node
59-
o["#cdata"] = X.escape(n.nodeValue);
60-
else if (o[n.nodeName]) {
61-
// multiple occurence of element ..
62-
if (o[n.nodeName] instanceof Array)
63-
o[n.nodeName][o[n.nodeName].length] = X.toObj(n);
64-
else o[n.nodeName] = [o[n.nodeName], X.toObj(n)];
65-
} // first occurence of element..
66-
else o[n.nodeName] = X.toObj(n);
67-
}
68-
} else {
69-
// mixed content
70-
if (!xml.attributes.length) o = X.escape(X.innerXml(xml));
71-
else o["#text"] = X.escape(X.innerXml(xml));
72-
}
73-
} else if (textChild) {
74-
// pure text
75-
if (!xml.attributes.length) o = X.escape(X.innerXml(xml));
76-
else o["#text"] = X.escape(X.innerXml(xml));
77-
} else if (cdataChild) {
78-
// cdata
79-
if (cdataChild > 1) o = X.escape(X.innerXml(xml));
80-
else
81-
for (var n = xml.firstChild; n; n = n.nextSibling)
82-
o["#cdata"] = X.escape(n.nodeValue);
83-
}
84-
}
85-
if (!xml.attributes.length && !xml.firstChild) o = null;
86-
} else if (xml.nodeType == 9) {
87-
// document.node
88-
o = X.toObj(xml.documentElement);
89-
} else alert("unhandled node type: " + xml.nodeType);
90-
return o;
91-
},
92-
toJson: function (o, name, ind) {
93-
var json = name ? '"' + name + '"' : "";
94-
if (o instanceof Array) {
95-
for (var i = 0, n = o.length; i < n; i++)
96-
o[i] = X.toJson(o[i], "", ind + "\t");
97-
json +=
98-
(name ? ":[" : "[") +
99-
(o.length > 1
100-
? "\n" + ind + "\t" + o.join(",\n" + ind + "\t") + "\n" + ind
101-
: o.join("")) +
102-
"]";
103-
} else if (o == null) json += (name && ":") + "null";
104-
else if (typeof o == "object") {
105-
var arr = [];
106-
for (var m in o) arr[arr.length] = X.toJson(o[m], m, ind + "\t");
107-
json +=
108-
(name ? ":{" : "{") +
109-
(arr.length > 1
110-
? "\n" + ind + "\t" + arr.join(",\n" + ind + "\t") + "\n" + ind
111-
: arr.join("")) +
112-
"}";
113-
} else if (typeof o == "string")
114-
json += (name && ":") + '"' + o.toString() + '"';
115-
else json += (name && ":") + o.toString();
116-
return json;
117-
},
118-
innerXml: function (node) {
119-
var s = "";
120-
if ("innerHTML" in node) s = node.innerHTML;
121-
else {
122-
var asXml = function (n) {
123-
var s = "";
124-
if (n.nodeType == 1) {
125-
s += "<" + n.nodeName;
126-
for (var i = 0; i < n.attributes.length; i++)
127-
s +=
128-
" " +
129-
n.attributes[i].nodeName +
130-
'="' +
131-
(n.attributes[i].nodeValue || "").toString() +
132-
'"';
133-
if (n.firstChild) {
134-
s += ">";
135-
for (var c = n.firstChild; c; c = c.nextSibling) s += asXml(c);
136-
s += "</" + n.nodeName + ">";
137-
} else s += "/>";
138-
} else if (n.nodeType == 3) s += n.nodeValue;
139-
else if (n.nodeType == 4) s += "<![CDATA[" + n.nodeValue + "]]>";
140-
return s;
141-
};
142-
for (var c = node.firstChild; c; c = c.nextSibling) s += asXml(c);
143-
}
144-
return s;
145-
},
146-
escape: function (txt) {
147-
return txt
148-
.replace(/[\\]/g, "\\\\")
149-
.replace(/[\"]/g, '\\"')
150-
.replace(/[\n]/g, "\\n")
151-
.replace(/[\r]/g, "\\r");
152-
},
153-
removeWhite: function (e) {
154-
e.normalize();
155-
for (var n = e.firstChild; n; ) {
156-
if (n.nodeType == 3) {
157-
// text node
158-
if (!n.nodeValue.match(/[^ \f\n\r\t\v]/)) {
159-
// pure whitespace text node
160-
var nxt = n.nextSibling;
161-
e.removeChild(n);
162-
n = nxt;
163-
} else n = n.nextSibling;
164-
} else if (n.nodeType == 1) {
165-
// element node
166-
X.removeWhite(n);
167-
n = n.nextSibling;
168-
} // any other node
169-
else n = n.nextSibling;
170-
}
171-
return e;
172-
},
173-
};
174-
if (xml.nodeType == 9)
175-
// document node
176-
xml = xml.documentElement;
177-
var json = X.toJson(X.toObj(X.removeWhite(xml)), xml.nodeName, "\t");
178-
return (
179-
"{\n" +
180-
tab +
181-
(tab ? json.replace(/\t/g, tab) : json.replace(/\t|\n/g, "")) +
182-
"\n}"
183-
);
184-
}
185-
186-
export function json2xml(o, tab) {
187-
var toXml = function (v, name, ind) {
188-
var xml = "";
189-
if (v instanceof Array) {
190-
for (var i = 0, n = v.length; i < n; i++)
191-
xml += ind + toXml(v[i], name, ind + "\t") + "\n";
192-
} else if (typeof v == "object") {
193-
var hasChild = false;
194-
xml += ind + "<" + name;
195-
for (var m in v) {
196-
if (m.charAt(0) == "@")
197-
xml += " " + m.substr(1) + '="' + v[m].toString() + '"';
198-
else hasChild = true;
199-
}
200-
xml += hasChild ? ">" : "/>";
201-
if (hasChild) {
202-
for (var m in v) {
203-
if (m == "#text") xml += v[m];
204-
else if (m == "#cdata") xml += "<![CDATA[" + v[m] + "]]>";
205-
else if (m.charAt(0) != "@") xml += toXml(v[m], m, ind + "\t");
206-
}
207-
xml +=
208-
(xml.charAt(xml.length - 1) == "\n" ? ind : "") + "</" + name + ">";
209-
}
210-
} else {
211-
xml += ind + "<" + name + ">" + v.toString() + "</" + name + ">";
212-
}
213-
return xml;
214-
},
215-
xml = "";
216-
for (var m in o) xml += toXml(o[m], m, "");
217-
return tab ? xml.replace(/\t/g, tab) : xml.replace(/\t|\n/g, "");
218-
}
25+
// https://stackoverflow.com/a/20861541/11898496

scripts/youtube_getVideoCaption.js

Lines changed: 104 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import { UfsGlobal } from "./content-scripts/ufs_global.js";
12
import { BADGES } from "./helpers/badge.js";
23

34
export default {
@@ -18,15 +19,17 @@ export default {
1819
whiteList: ["https://*.youtube.com/*"],
1920

2021
pageScript: {
21-
onClick: () => {
22-
function renderCaptions(captions) {
22+
onClick: async () => {
23+
const { parseXml } = await import("./libs/utils/xmlParser.js");
24+
25+
function renderCaptions(captions, title) {
2326
const id = "ufs_youtube_getVideoCaption";
2427
const exist = document.getElementById(id);
2528
if (exist) exist.remove();
2629

2730
const div = document.createElement("div");
2831
div.id = id;
29-
div.innerHTML = `
32+
div.innerHTML = /*html*/ `
3033
<style>
3134
#${id} {
3235
position: fixed;
@@ -60,58 +63,135 @@ export default {
6063
overflow-x: hidden;
6164
}
6265
#${id} button {
63-
position: absolute;
64-
top: 10px;
65-
right: 10px;
66-
padding: 10px;
67-
background: #eee;
68-
border-radius: 5px;
66+
display: inline-block;
6967
cursor: pointer;
7068
}
7169
</style>
7270
<div>
73-
<button>Close</button>
7471
<h3>Captions</h3><br/>
7572
<ul>
7673
${captions
7774
.map(
7875
(caption) => `<li>
79-
<a href="${caption.baseUrl}" target="_blank">
80-
${caption.name.simpleText} (${caption.languageCode})
81-
</a>
76+
${caption.name.simpleText} (${caption.languageCode})
77+
<a href="${caption.baseUrl}" data-type="srt">srt</a>
78+
<a href="${caption.baseUrl}" data-type="txt">txt</a>
79+
<a href="${caption.baseUrl}" target="_blank">xml</a>
8280
</li>`
8381
)
8482
.join("")}
8583
</ul>
84+
<br/>
85+
<a href="https://downsub.com/?url=${encodeURIComponent(
86+
location.href
87+
)}" target="_blank">Auto translate</a>
8688
</div>
8789
`;
88-
const button = div.querySelector("button");
89-
button.onclick = () => {
90-
div.remove();
90+
div.onclick = async (e) => {
91+
if (e.target == div) div.remove();
92+
if (e.target.tagName == "A") {
93+
const type = e.target.getAttribute("data-type");
94+
if (type) {
95+
e.preventDefault();
96+
downloadCaption(e.target.getAttribute("href"), type, title);
97+
}
98+
}
9199
};
92100
document.documentElement.appendChild(div);
93101
}
94102

103+
/**
 * Fetch one caption track and export it in the requested format.
 *
 * The response is parsed with `parseXml` and is expected to contain a
 * `<transcript>` element whose `<text>` children carry `start` and `dur`
 * attributes (in seconds) and the caption text as their content.
 *
 * - `"txt"`: the decoded caption lines, joined by newlines, are shown in an
 *   alert.
 * - `"srt"`: numbered SubRip cues are built and passed to
 *   `UfsGlobal.Utils.downloadData` under the name `<title>.srt`.
 *
 * Any failure (network error, non-2xx status, unexpected document shape) is
 * reported through `alert` and `console.error`; nothing is thrown to the caller.
 *
 * @param {string} url - Caption track URL (the track's `baseUrl`).
 * @param {string} type - Output format: `"txt"` or `"srt"`; other values do nothing.
 * @param {string} title - Base file name used for the download.
 * @returns {Promise<void>}
 */
async function downloadCaption(url, type, title) {
  try {
    // Plain GET: no request headers are needed to read the caption XML.
    const res = await fetch(url);
    if (!res.ok) {
      throw new Error(`Failed to fetch captions: HTTP ${res.status}`);
    }
    const text = await res.text();
    const xml = parseXml(text);

    // parseXml can return null, and the document may not have the expected
    // <transcript> root; fail with a readable message instead of a TypeError.
    const transcript = xml?.getElementsByTagName("transcript")[0];
    if (!transcript) {
      throw new Error(
        "Unexpected caption format: <transcript> element not found"
      );
    }
    const texts = Array.from(transcript.getElementsByTagName("text"));

    if (type === "txt") {
      const data = texts
        .map((t) => decodeHtmlEntities(t.textContent))
        .join("\n");
      alert(data);
    } else if (type === "srt") {
      const data = texts
        .map((t, i) => {
          // One SRT cue looks like:
          //   1
          //   00:00:00,120 --> 00:00:01,880
          //   caption text
          const index = i + 1;
          const start = Number(t.getAttribute("start"));
          const dur = Number(t.getAttribute("dur"));
          const end = start + dur;
          return (
            index +
            "\n" +
            formatTimeToSRT(start) +
            " --> " +
            formatTimeToSRT(end) +
            "\n" +
            decodeHtmlEntities(t.textContent)
          );
        })
        .join("\n\n");
      UfsGlobal.Utils.downloadData(data, title + ".srt");
    }
  } catch (e) {
    alert(e);
    console.error(e);
  }
}
150+
151+
/**
 * Left-pad a number with zeros.
 *
 * @param {number} num - Value to format.
 * @param {number} [len=2] - Minimum width of the result.
 * @returns {string} `num` as a string, zero-padded to at least `len` characters.
 */
const pad = (num, len = 2) => num.toString().padStart(len, "0");

/**
 * Format a time offset in seconds as an SRT timestamp (`HH:MM:SS,mmm`).
 *
 * Example: 6.120 -> "00:00:06,120"
 *
 * @param {number} seconds - Offset from the start of the video, in seconds.
 * @returns {string} The SRT timestamp.
 */
function formatTimeToSRT(seconds) {
  // Round once to whole milliseconds and split with integer arithmetic.
  // Taking `seconds % 1` and flooring it loses a millisecond to floating-point
  // error (e.g. 1.001 % 1 is 0.000999..., which floors to 0 ms).
  const totalMs = Math.round(seconds * 1000);

  const hours = Math.floor(totalMs / 3600000);
  const minutes = Math.floor((totalMs % 3600000) / 60000);
  const secs = Math.floor((totalMs % 60000) / 1000);
  const milliseconds = totalMs % 1000;

  return `${pad(hours)}:${pad(minutes)}:${pad(secs)},${pad(milliseconds, 3)}`;
}
166+
167+
// Scratch <textarea> used for entity decoding; created on first use and
// reused between calls.
var textArea = null;

/**
 * Decode HTML entities (e.g. `&amp;`, `&#39;`) in a string.
 *
 * @param {string} text - Text that may contain HTML entities.
 * @returns {string} The text with entities replaced by their characters.
 */
function decodeHtmlEntities(text) {
  if (!textArea) textArea = document.createElement("textarea");
  textArea.innerHTML = text;
  // Read `.value`, not `.innerHTML`: reading innerHTML serializes the content
  // again and re-escapes `&`, `<` and `>`, undoing the decoding.
  return textArea.value;
}
172+
95173
const methods = [
96-
() =>
97-
document.getElementsByTagName("ytd-app")[0].data.playerResponse
98-
.captions.playerCaptionsTracklistRenderer.captionTracks,
99-
() =>
100-
ytplayer.config.args.raw_player_response.captions
101-
.playerCaptionsTracklistRenderer.captionTracks,
174+
() => document.getElementsByTagName("ytd-app")[0].data.playerResponse,
175+
() => ytplayer.config.args.raw_player_response,
102176
];
103177

104178
for (let f of methods) {
105179
try {
106-
let captions = f();
180+
let p = f();
181+
let captions =
182+
p.captions.playerCaptionsTracklistRenderer.captionTracks;
183+
let title = p.videoDetails?.title || document.title;
184+
107185
if (captions) {
108-
renderCaptions(captions);
186+
renderCaptions(captions, title);
109187
return;
110188
}
111189
} catch (e) {
112190
console.error(e);
113191
}
114192
}
193+
194+
alert("No captions found");
115195
},
116196
},
117197
};

0 commit comments

Comments
 (0)