Multimodal Browser AI with Transformers.js for Images and Speech

AIArt
-June 13, 2026
- No Comments

<title>Multimodal Media Analyzer</title>

/* Global reset: border-box sizing and no default margin or padding. */
* { box-sizing: border-box; margin: 0; padding: 0; }

/* Page shell: a centered column capped at 820px. */
body {
  font-family: system-ui, sans-serif;
  max-width: 820px;
  margin: 0 auto;
  padding: 1.5rem 1rem;
  background: #f1f5f9;
  color: #1e293b;
}

/* Page header: title plus a muted subtitle. */
header { margin-bottom: 1.5rem; }

header h1 { font-size: 1.5rem; }

header p { color: #64748b; font-size: 0.9rem; margin-top: 0.2rem; }

/* Model status indicators */

/* Row of badges; wraps onto a new line on narrow screens. */
.model-status-bar {
  display: flex;
  gap: 0.5rem;
  flex-wrap: wrap;
  margin-top: 0.75rem;
}

/* Default (loading) badge: amber pill. */
.model-badge {
  font-size: 0.78rem;
  padding: 0.2rem 0.6rem;
  border-radius: 12px;
  background: #fef3c7;
  color: #92400e;
}

/* The script adds .ready to a badge to turn it green. */
.model-badge.ready { background: #dcfce7; color: #15803d; }

/* Tab bar */

/* Container: a white pill holding equally sized tabs. */
.tabs {
  display: flex;
  background: white;
  border-radius: 8px;
  padding: 0.25rem;
  gap: 0.25rem;
  margin-bottom: 1.25rem;
  border: 1px solid #e2e8f0;
}

/* Individual tab; flex: 1 makes all tabs share the width evenly. */
.tab {
  flex: 1;
  padding: 0.5rem;
  text-align: center;
  border-radius: 6px;
  cursor: pointer;
  font-size: 0.9rem;
  color: #64748b;
  transition: all 0.15s;
}

/* Currently selected tab. */
.tab.active { background: #2563eb; color: white; font-weight: 600; }

/* Input panels */

/* Only the panel carrying .active is visible. */
.panel { display: none; }

.panel.active { display: block; }

/* Click / drag-and-drop target for image files. */
.upload-area {
  background: white;
  border: 2px dashed #cbd5e1;
  border-radius: 8px;
  padding: 2rem;
  text-align: center;
  cursor: pointer;
}

/* The native file input is hidden; the script forwards clicks to it. */
.upload-area input { display: none; }

/* Image preview stays hidden until the script sets display: block. */
#img-preview {
  margin-top: 1rem;
  max-width: 100%;
  max-height: 320px;
  border-radius: 8px;
  display: none;
  object-fit: cover;
}

/* Recorder controls */
.mic-center { text-align: center; padding: 1rem 0; }

/* Round record button; red when idle. */
#rec-btn {
  width: 72px;
  height: 72px;
  border-radius: 50%;
  border: none;
  background: #dc2626;
  color: white;
  font-size: 1.6rem;
  cursor: pointer;
  display: flex;
  align-items: center;
  justify-content: center;
  margin: 0 auto 0.5rem;
}

/* Dark while a recording is in progress. */
#rec-btn.recording { background: #374151; }

#rec-btn:disabled { background: #94a3b8; cursor: not-allowed; }

#rec-timer { font-weight: 600; color: #374151; margin-bottom: 0.25rem; }

#rec-hint { font-size: 0.85rem; color: #64748b; }

#wave-canvas { display: block; margin: 0.5rem auto; border-radius: 4px; }

/* Results grid */

/* Responsive grid: as many columns of at least 220px as fit. */
.results-grid {
  display: grid;
  grid-template-columns: repeat(auto-fit, minmax(220px, 1fr));
  gap: 1rem;
  margin-top: 1.25rem;
}

/* One card per result type. */
.result-card {
  background: white;
  border: 1px solid #e2e8f0;
  border-radius: 8px;
  padding: 1rem;
}

/* Small uppercase card heading. */
.result-card h3 {
  font-size: 0.75rem;
  text-transform: uppercase;
  letter-spacing: 0.06em;
  color: #64748b;
  margin-bottom: 0.6rem;
}

/* One classification row: label on the left, score on the right. */
.label-item {
  display: flex;
  justify-content: space-between;
  align-items: center;
  padding: 0.25rem 0;
  font-size: 0.875rem;
  border-bottom: 1px solid #f1f5f9;
}

/* Score chip. */
.label-score {
  font-size: 0.8rem;
  color: #64748b;
  background: #f1f5f9;
  padding: 0.1rem 0.4rem;
  border-radius: 4px;
}

.caption-body {
  font-size: 0.95rem;
  line-height: 1.5;
  font-style: italic;
  color: #334155;
}

/* pre-wrap keeps line breaks that appear in the transcript text. */
.transcript-body {
  font-size: 0.95rem;
  line-height: 1.6;
  color: #334155;
  white-space: pre-wrap;
}

.placeholder-text { color: #94a3b8; font-style: italic; font-size: 0.9rem; }

/* Status line shown above the tabs. */
#global-status {
  font-size: 0.85rem;
  color: #64748b;
  margin-bottom: 1rem;
}

/* Narrow screens: force the results grid to a single column. */
@media (max-width: 500px) {
  .results-grid { grid-template-columns: 1fr; }
}

<h1>Multimodal Media Analyzer</h1>

<p>Image classification, captioning, and speech transcription — all in your browser.</p>

<span class="model-badge" id="badge-cls">Classifier: loading...</span>

<span class="model-badge" id="badge-cap">Captioner: loading...</span>

<span class="model-badge" id="badge-asr">Whisper: loading...</span>

</div>

</header>

<div id="global-status">Loading models in parallel — first run downloads ~400 MB total.</div>

<div class="tab active" data-tab="image">🖼 Image Analysis</div>

<div class="tab" data-tab="speech">🎙 Speech Transcription</div>

</div>

<!-- Image panel -->

<p>Click or drag an image to analyze</p>

<p>JPG, PNG, WebP, GIF supported</p>

</div>

</div>

<!-- Speech panel -->

<div id="rec-hint">Waiting for Whisper model...</div>

</div>

<!-- Results – shown for both modes -->

<!-- Image results (shown in image mode) -->

<h3>Classification</h3>

<p class="placeholder-text">No results yet.</p>

</div>

<h3>Caption</h3>

<p class="placeholder-text">No caption yet.</p>

</div>

<!-- Speech results (shown in speech mode) -->

<h3>Transcription</h3>

<p class="placeholder-text">Record audio to see the transcription.</p>

</div>

import { pipeline }

from ‘https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.2’;

// ── Pipeline references ───────────────────────────────────────────────

// Each pipeline handle is undefined until the model loader assigns it.
let classifier;
let captioner;
let transcriber;

// How many model badges have been switched to "ready" so far.
let readyCount = 0;

/**
 * Switch a model badge to its "ready" state and, once three distinct badges
 * are ready, unlock the recording UI.
 *
 * The call is idempotent per badge: callers may report the same model more
 * than once (for example from repeated 'done' progress events), and only the
 * first report for a badge is counted.
 *
 * @param {string} badgeId - id of the badge element to update.
 * @param {string} label - human-readable model name shown in the badge.
 * @returns {void}
 */
function markReady(badgeId, label) {
  const badge = document.getElementById(badgeId);

  // Already counted: a repeat report must not inflate readyCount.
  if (badge.classList.contains('ready')) {
    return;
  }

  badge.textContent = `${label}: ready`;
  badge.classList.add('ready');
  readyCount++;

  if (readyCount === 3) {
    globalStatus.textContent = 'All models ready. Upload an image or record audio.';
    recBtn.disabled = false;
    recHint.textContent = 'Click to start recording.';
  }
}

// Load all three pipelines simultaneously

/**
 * Create one pipeline and flip its badge to "ready" only after the pipeline
 * promise has resolved. Exactly one markReady call is made per model, instead
 * of one per 'done' progress event.
 *
 * @param {string} task - pipeline task name.
 * @param {string} model - model id to load.
 * @param {string} badgeId - id of the status badge for this model.
 * @param {string} label - model name shown in the badge.
 * @returns {Promise<Function>} resolves to the ready-to-call pipeline.
 */
function loadModel(task, model, badgeId, label) {
  return pipeline(task, model, { dtype: 'q8' }).then((pipe) => {
    markReady(badgeId, label);
    return pipe;
  });
}

Promise.all([
  loadModel('image-classification', 'Xenova/vit-base-patch16-224', 'badge-cls', 'Classifier'),
  loadModel('image-to-text', 'Xenova/vit-gpt2-image-captioning', 'badge-cap', 'Captioner'),
  loadModel('automatic-speech-recognition', 'Xenova/whisper-tiny.en', 'badge-asr', 'Whisper'),
])
  .then(([cls, cap, asr]) => {
    classifier = cls;
    captioner = cap;
    transcriber = asr;
  })
  .catch((err) => {
    // Promise.all rejects as soon as any one model fails to load.
    globalStatus.textContent = `Error loading models: ${err.message}`;
  });

// ── UI references ─────────────────────────────────────────────────────

// Status line and results container.
const globalStatus = document.getElementById('global-status');
const resultsGrid = document.getElementById('results-grid');

// Recorder controls.
const recBtn = document.getElementById('rec-btn');
const recHint = document.getElementById('rec-hint');
const recTimer = document.getElementById('rec-timer');

// Canvas and 2D context used to draw the live waveform.
const waveCanvas = document.getElementById('wave-canvas');
const waveCtx = waveCanvas.getContext('2d');

// ── Image analysis ────────────────────────────────────────────────────

/**
 * Run classification and captioning on one image in parallel and render both
 * results into the image result cards.
 *
 * Model output is written with textContent and never parsed as HTML.
 *
 * @param {string} dataUrl - image source passed to both pipelines.
 * @returns {Promise<void>} resolves after the cards and status line are
 *   updated; pipeline errors are reported in the status line, not thrown.
 */
async function analyzeImage(dataUrl) {
  if (!classifier || !captioner) {
    globalStatus.textContent = 'Models still loading. Please wait.';
    return;
  }

  globalStatus.textContent = 'Running classification and captioning…';

  const clsContent = document.getElementById('cls-content');
  const capContent = document.getElementById('cap-content');

  // Show image result cards, hide speech card
  document.getElementById('card-cls').style.display = 'block';
  document.getElementById('card-cap').style.display = 'block';
  document.getElementById('card-asr').style.display = 'none';
  resultsGrid.style.display = 'grid';

  clsContent.innerHTML = '<p class="placeholder-text">Classifying…</p>';
  capContent.innerHTML = '<p class="placeholder-text">Generating caption…</p>';

  try {
    // Run classification and captioning in parallel
    const [classResults, captionResults] = await Promise.all([
      classifier(dataUrl, { top_k: 4 }),
      captioner(dataUrl, { max_new_tokens: 60 }),
    ]);

    // Render classification labels: one .label-item row per prediction.
    const rows = classResults.map(({ label, score }) => {
      const row = document.createElement('div');
      row.className = 'label-item';

      const name = document.createElement('span');
      name.textContent = label;

      const pct = document.createElement('span');
      pct.className = 'label-score';
      pct.textContent = `${(score * 100).toFixed(1)}%`;

      row.append(name, pct);
      return row;
    });
    clsContent.replaceChildren(...rows);

    // Render generated caption
    const caption = document.createElement('p');
    caption.className = 'caption-body';
    caption.textContent = `“${captionResults[0]?.generated_text ?? 'No caption.'}”`;
    capContent.replaceChildren(caption);

    globalStatus.textContent = 'Analysis complete.';
  } catch (err) {
    // Do not leave the "Classifying…" placeholders on screen after a failure.
    clsContent.innerHTML = '<p class="placeholder-text">No results yet.</p>';
    capContent.innerHTML = '<p class="placeholder-text">No caption yet.</p>';
    globalStatus.textContent = `Error: ${err.message}`;
  }
}

// File upload handler for images

// Drop zone, file input and preview image used by the upload handlers.
const imgDrop = document.getElementById('img-drop');
const imgInput = document.getElementById('img-input');
const imgPrev = document.getElementById('img-preview');

/**
 * Read an image file as a data URL, show it in the preview element and start
 * the analysis. Missing files and non-image files are ignored.
 *
 * @param {File|undefined} file - file chosen or dropped by the user.
 * @returns {void}
 */
function handleImageFile(file) {
  if (!file?.type.startsWith('image/')) {
    return;
  }

  const reader = new FileReader();

  reader.onload = () => {
    const dataUrl = reader.result;
    imgPrev.src = dataUrl;
    imgPrev.style.display = 'block';
    analyzeImage(dataUrl);
  };

  // Report read failures instead of failing silently.
  reader.onerror = () => {
    globalStatus.textContent = `Error: ${reader.error?.message ?? 'Could not read the image file.'}`;
  };

  reader.readAsDataURL(file);
}

// Clicking the drop zone opens the hidden file picker.
imgDrop.addEventListener('click', () => imgInput.click());

imgInput.addEventListener('change', (e) => {
  handleImageFile(e.target.files[0]);
  // Clear the selection so picking the same file again fires 'change' again.
  e.target.value = '';
});

// preventDefault on dragover is what allows the element to receive a drop.
imgDrop.addEventListener('dragover', (e) => e.preventDefault());

imgDrop.addEventListener('drop', (e) => {
  // Stop the browser from navigating to the dropped file.
  e.preventDefault();
  handleImageFile(e.dataTransfer.files[0]);
});

// ── Audio decoding helper ─────────────────────────────────────────────

/**
 * Decode encoded audio bytes and return the samples of the first channel.
 *
 * A temporary AudioContext is created with a 16 kHz sample rate for the
 * decode and is always closed afterwards, so repeated recordings do not
 * accumulate open audio contexts.
 *
 * @param {ArrayBuffer} arrayBuffer - encoded audio data.
 * @returns {Promise<Float32Array>} samples of channel 0 of the decoded buffer.
 * @throws rejects with the decoder's error when the data cannot be decoded.
 */
async function decodeAudio(arrayBuffer) {
  const audioCtx = new AudioContext({ sampleRate: 16000 });
  try {
    const audioBuffer = await audioCtx.decodeAudioData(arrayBuffer);
    return audioBuffer.getChannelData(0); // Mono Float32Array at 16kHz
  } finally {
    await audioCtx.close();
  }
}

// ── Speech transcription ──────────────────────────────────────────────

/**
 * Transcribe audio samples with the speech pipeline and render the text into
 * the transcription card.
 *
 * The transcript is written with textContent and never parsed as HTML.
 *
 * @param {Float32Array} audioData - audio samples handed to the transcriber.
 * @returns {Promise<void>} resolves after the card and status line are
 *   updated; pipeline errors are reported in the status line, not thrown.
 */
async function runTranscription(audioData) {
  if (!transcriber) {
    globalStatus.textContent = 'Models still loading. Please wait.';
    return;
  }

  const asrContent = document.getElementById('asr-content');

  // Show speech result card, hide image cards
  document.getElementById('card-cls').style.display = 'none';
  document.getElementById('card-cap').style.display = 'none';
  document.getElementById('card-asr').style.display = 'block';
  resultsGrid.style.display = 'grid';

  asrContent.innerHTML = '<p class="placeholder-text">Transcribing…</p>';
  globalStatus.textContent = 'Running Whisper transcription…';

  try {
    const result = await transcriber(audioData, {
      chunk_length_s: 30,
      stride_length_s: 5,
    });

    const transcript = document.createElement('p');
    transcript.className = 'transcript-body';
    transcript.textContent = result.text.trim();
    asrContent.replaceChildren(transcript);

    globalStatus.textContent = 'Transcription complete.';
  } catch (err) {
    // Do not leave the "Transcribing…" placeholder on screen after a failure.
    asrContent.innerHTML = '<p class="placeholder-text">Record audio to see the transcription.</p>';
    globalStatus.textContent = `Error: ${err.message}`;
  }
}

// ── Microphone recording ──────────────────────────────────────────────

// Mutable recorder state shared by the waveform renderer and the click handler.
let mediaRecorder;
let audioChunks = [];
let timerInterval;
let analyserNode;
let animId;

// Seconds elapsed in the current recording.
let secs = 0;

/**
 * Draw one frame of the live waveform from the analyser's time-domain data
 * and schedule the next frame. The request id is stored in animId so the
 * animation can be cancelled.
 *
 * @returns {void}
 */
function drawWave() {
  const samples = new Uint8Array(analyserNode.frequencyBinCount);
  analyserNode.getByteTimeDomainData(samples);

  waveCtx.clearRect(0, 0, waveCanvas.width, waveCanvas.height);
  waveCtx.beginPath();
  waveCtx.strokeStyle = '#2563eb';
  waveCtx.lineWidth = 1.5;

  const midline = waveCanvas.height / 2;
  for (let i = 0; i < samples.length; i++) {
    const x = (i / samples.length) * waveCanvas.width;
    // Byte samples are scaled by 128, so a value of 128 lands on the midline.
    const y = (samples[i] / 128.0) * midline;
    if (i === 0) {
      waveCtx.moveTo(x, y);
    } else {
      waveCtx.lineTo(x, y);
    }
  }

  waveCtx.stroke();
  animId = requestAnimationFrame(drawWave);
}

/**
 * Record button: the first click starts a microphone recording, the next
 * click stops it and hands the audio to decoding and transcription.
 *
 * The button is disabled from the stop click until processing has finished,
 * so a new recording cannot start while the previous one is being handled.
 */
recBtn.addEventListener('click', async () => {
  if (mediaRecorder?.state === 'recording') {
    mediaRecorder.stop();
    recBtn.classList.remove('recording');
    recBtn.textContent = '🎙';
    recBtn.disabled = true;
    clearInterval(timerInterval);
    cancelAnimationFrame(animId);
    waveCtx.clearRect(0, 0, waveCanvas.width, waveCanvas.height);
    recHint.textContent = 'Processing…';
    return;
  }

  try {
    const stream = await navigator.mediaDevices.getUserMedia({ audio: true });

    // Analyser graph that feeds the live waveform.
    const actx = new AudioContext();
    analyserNode = actx.createAnalyser();
    actx.createMediaStreamSource(stream).connect(analyserNode);
    analyserNode.fftSize = 256;

    const recorder = new MediaRecorder(stream);
    mediaRecorder = recorder;
    audioChunks = [];

    recorder.ondataavailable = (e) => {
      if (e.data.size) {
        audioChunks.push(e.data);
      }
    };

    recorder.onstop = async () => {
      // Recording is over: release the microphone right away, even if
      // decoding or transcription fails later.
      stream.getTracks().forEach((track) => track.stop());
      try {
        await actx.close();
        // Label the blob with the container the recorder actually produced;
        // fall back to the original 'audio/webm' when none is reported.
        const blob = new Blob(audioChunks, { type: recorder.mimeType || 'audio/webm' });
        const audioData = await decodeAudio(await blob.arrayBuffer());
        await runTranscription(audioData);
        recHint.textContent = 'Click to record again.';
      } catch (err) {
        recHint.textContent = `Audio error: ${err.message}`;
      } finally {
        recBtn.disabled = false;
      }
    };

    recorder.start();
    recBtn.classList.add('recording');
    recBtn.textContent = '⏹';

    secs = 0;
    recTimer.textContent = '0:00';
    timerInterval = setInterval(() => {
      secs++;
      recTimer.textContent = `${Math.floor(secs / 60)}:${String(secs % 60).padStart(2, '0')}`;
    }, 1000);

    recHint.textContent = 'Recording… click to stop.';
    drawWave();
  } catch (err) {
    recHint.textContent = `Mic error: ${err.message}`;
  }
});

// ── Tab switching ─────────────────────────────────────────────────────

// Clicking a tab clears .active from every tab and panel, then activates the
// clicked tab and the panel named by its data-tab attribute.
// NOTE(review): assumes the panels carry ids "panel-<data-tab>" with an ASCII
// hyphen (e.g. "panel-image"); the panel markup is not visible here — confirm.
document.querySelectorAll('.tab').forEach((tab) => {
  tab.addEventListener('click', () => {
    document.querySelectorAll('.tab, .panel').forEach((el) => {
      el.classList.remove('active');
    });

    tab.classList.add('active');
    document.getElementById(`panel-${tab.dataset.tab}`).classList.add('active');
  });
});

crossroad.joykonark.com

Writer & Blogger

Considered an invitation do introduced sufficient understood instrument it. Of decisively friendship in as collecting at. No affixed be husband ye females brother garrets proceed. Least child who seven happy yet balls young. Discovery sweetness principle discourse shameless bed one excellent. Sentiments of surrounded friendship dispatched connection is he.

About Me

Kapil Kumar

Founder & Editor

As a passionate explorer of the intersection between technology, art, and the natural world, I’ve embarked on a journey to unravel the fascinating connections that weave our world together. In my digital haven, you’ll find a blend of insights into cutting-edge technology, the mesmerizing realms of artificial intelligence, and the expressive beauty of art.

Instagram

Follow on Instagram

Edit Template

Subscribe Now

Subscribe Now

Multimodal Browser AI with Transformers.js for Images and Speech

crossroad.joykonark.com

Writer & Blogger

Leave a Reply Cancel reply

About Me

Kapil Kumar

Founder & Editor

Popular Articles

Access Denied

Access Denied

Access Denied

Instagram

Quick Links

Home

Features

Terms & Conditions

Privacy Policy

Contact

Recent Posts

Access Denied

Access Denied

Contact Us

Quick Links

Home

Features

Terms & Conditions

Privacy Policy

Contact

Recent Posts

Access Denied

Access Denied

Contact Us

Fill Your Contact Details

Fill out this form, and we’ll reach out to you through WhatsApp for further communication.