Collecting Words

This was a little side project of mine before I realized that English word lists are already out there on the internet—easily accessible with just a few clicks.
At the time, I simply wanted to build a Hangman game and an easy way to solve the New York Times Wordle game.
My first thought was: why not collect the words myself? The internet is full of them anyway. I figured Wikipedia would be a great source, since its main page refreshes daily. That meant new words every single day 🙂.
So, I rolled up my sleeves and wrote a Google Apps Script on Google Sheets, scheduled to run daily. I also added another script that runs every Sunday to clean up duplicates.
Naturally, not all words were useful. I had to filter out:
Words containing numbers.
Words with uppercase characters (such as city or person names).
Words shorter than 3 characters.
Words containing special characters like underscores.
And finally, a custom exclusion list. Over time, I noticed Wikipedia’s HTML included lots of technical junk words (since I was parsing the entire HTML, not just the visible content). Stuff like eventlogging, www, or memusage. I kept track of those in a separate sheet—this part was manual.
Fast forward a bit: thanks to Google Apps Script triggers, the project basically ran itself. Before I knew it, I had collected more than 370,000 words.
Of course, the list still isn’t perfect—some words still need filtering, and both singular and plural forms sneak in. But overall, it was a fun experiment, and I learned a lot along the way.
/**
 * Fetches the Wikipedia main page, extracts candidate words from its HTML,
 * runs them through the filter pipeline and appends the survivors to the
 * "ListofWords" sheet.
 *
 * Logs a message and returns without writing anything when the page cannot
 * be fetched, when the server answers with a status other than 200, or when
 * no word survives the filters.
 */
function retrieveWordsFromWebPage() {
  const url = "https://en.wikipedia.org/wiki/Main_Page";

  let response;
  try {
    // Without muteHttpExceptions, fetch() throws on non-2xx responses and the
    // status check below could never report the actual status code.
    response = UrlFetchApp.fetch(url, { muteHttpExceptions: true });
  } catch (error) {
    Logger.log("Error fetching webpage: " + error);
    return;
  }

  const statusCode = response.getResponseCode();
  if (statusCode !== 200) {
    Logger.log("Failed to fetch webpage. HTTP status code: " + statusCode);
    return;
  }

  // Tokenise the page, then narrow the list one filter at a time.
  const words = extractWordsFromHTML(response.getContentText());
  const uniqueWords = eliminateDuplicates(words);
  const wordsWithoutNumbers = removeWordsWithNumbers(uniqueWords);
  const wordsWithoutUppercase = removeUppercaseWords(wordsWithoutNumbers);
  const wordsWithoutTwoCharacters = removeTwoCharacterWords(wordsWithoutUppercase);
  const filteredWords = removeSpecialCharacters(wordsWithoutTwoCharacters);
  // Drop the manually maintained exclusion list last.
  const finalWords = eliminateWords(filteredWords);

  // Nothing survived the filters: skip the write instead of asking the sheet
  // for a zero-row range.
  if (finalWords.length === 0) {
    Logger.log("No new words to write.");
    return;
  }

  writeWordsToSheet(finalWords);
}
/**
 * Extracts word-like tokens from an HTML document.
 *
 * Markup is replaced with a space rather than deleted, so that words in
 * adjacent elements (e.g. "<p>foo</p><p>bar</p>") are not glued together.
 * The contents of <script> and <style> elements and of HTML comments are
 * dropped entirely because they hold code, not readable text.
 *
 * @param {string} html Raw HTML markup.
 * @returns {string[]} Tokens matching /\b\w+\b/ in document order, possibly
 *     with duplicates; an empty array when `html` is falsy or has no tokens.
 */
function extractWordsFromHTML(html) {
  if (!html) {
    return [];
  }

  // Remove whole script/style elements first; a plain tag strip would keep
  // their bodies and feed identifiers into the word list.
  let text = html.replace(/<(script|style)\b[^>]*>[\s\S]*?<\/\1\s*>/gi, ' ');
  text = text.replace(/<!--[\s\S]*?-->/g, ' ');
  // Remaining tags become separators.
  text = text.replace(/<[^>]*>/g, ' ');
  // Named (&amp;), decimal (&#160;) and hexadecimal (&#xA0;) entities.
  text = text.replace(/&(?:#\d+|#x[0-9a-f]+|[a-z][a-z0-9]*);/gi, ' ');

  return text.match(/\b\w+\b/g) || [];
}
/**
 * Returns the distinct entries of `words`, keeping the order in which each
 * entry first appears.
 *
 * @param {string[]} words Words that may contain repeats.
 * @returns {string[]} A new array without repeated entries.
 */
function eliminateDuplicates(words) {
  return [...new Set(words)];
}
/**
 * Drops every word that contains at least one digit.
 *
 * @param {string[]} words Candidate words.
 * @returns {string[]} A new array holding only the digit-free words.
 */
function removeWordsWithNumbers(words) {
  const anyDigit = /\d/;
  return words.filter((candidate) => !anyDigit.test(candidate));
}
/**
 * Drops every word that contains an uppercase letter at any position,
 * including the first one (this is what removes proper nouns and acronyms).
 *
 * A word is kept exactly when lowercasing it changes nothing, so caseless
 * characters such as digits or underscores do not count as uppercase.
 *
 * @param {string[]} words Candidate words.
 * @returns {string[]} A new array holding only the words without uppercase
 *     letters.
 */
function removeUppercaseWords(words) {
  return words.filter((word) => word === word.toLowerCase());
}
/**
 * Drops words that are shorter than three characters.
 *
 * @param {string[]} words Candidate words.
 * @returns {string[]} A new array holding only words of length 3 or more.
 */
function removeTwoCharacterWords(words) {
  const minimumLength = 3;
  return words.filter((candidate) => candidate.length >= minimumLength);
}
/**
 * Drops every word that contains an underscore, a comma or a dot.
 *
 * @param {string[]} words Candidate words.
 * @returns {string[]} A new array holding only words free of those characters.
 */
function removeSpecialCharacters(words) {
  const forbidden = /[_,.]/;
  return words.filter((candidate) => !forbidden.test(candidate));
}
/**
 * Reads the manual exclusion list from column A of the "EXCLUDED_WORDS" sheet.
 *
 * Cell values are converted to strings and trimmed, and blank cells are
 * skipped, so an entry with stray surrounding whitespace still matches the
 * word it was meant to exclude.
 *
 * @returns {string[]} The excluded words in sheet order; an empty array when
 *     the sheet is missing (a message is logged) or has no content.
 */
function getExcludedWords() {
  const sheet = SpreadsheetApp.getActiveSpreadsheet().getSheetByName("EXCLUDED_WORDS");
  if (!sheet) {
    // getSheetByName returns null for an unknown name; treat that as
    // "nothing to exclude" instead of failing on the property access below.
    Logger.log("Sheet 'EXCLUDED_WORDS' not found; no words will be excluded.");
    return [];
  }

  const lastRow = sheet.getLastRow();
  if (lastRow < 1) {
    return [];
  }

  return sheet
    .getRange(1, 1, lastRow, 1)
    .getValues()
    .map((row) => (row[0] == null ? "" : String(row[0]).trim()))
    .filter((word) => word !== "");
}
/**
 * Removes every word that appears in the exclusion list returned by
 * getExcludedWords(). The comparison ignores letter case.
 *
 * @param {string[]} words Candidate words.
 * @returns {string[]} A new array without the excluded words.
 */
function eliminateWords(words) {
  // Lowercased lookup table of everything that must be dropped.
  const blocked = new Set();
  for (const entry of getExcludedWords()) {
    blocked.add(entry.toString().toLowerCase());
  }

  return words.filter((candidate) => !blocked.has(candidate.toString().toLowerCase()));
}
/**
 * Appends `words` to column A of the "ListofWords" sheet, one word per row,
 * starting on the row after the last non-empty cell of that column.
 *
 * @param {string[]} words Words to append. Nothing is written when the list
 *     is empty or missing.
 * @throws {Error} When the "ListofWords" sheet does not exist.
 */
function writeWordsToSheet(words) {
  // A range must span at least one row, so an empty list cannot be written.
  if (!words || words.length === 0) {
    return;
  }

  const sheet = SpreadsheetApp.getActiveSpreadsheet().getSheetByName("ListofWords");
  if (!sheet) {
    // Fail with a clear message rather than an opaque null-dereference error.
    throw new Error("Sheet 'ListofWords' not found; cannot write words.");
  }

  const lastRow = findLastRow("ListofWords", "A");
  // setValues expects a 2-D array: one single-cell row per word.
  const rows = words.map((word) => [word]);
  sheet.getRange(lastRow + 1, 1, rows.length, 1).setValues(rows);
}
/**
 * Removes duplicate entries from column A of the "ListofWords" sheet.
 *
 * Row 1 is left out of the de-duplicated range. The function works through
 * the sheet's active selection and finishes with cell C2 selected.
 */
function removeColumnDuplicates() {
  const wordSheet = SpreadsheetApp.getActiveSpreadsheet().getSheetByName("ListofWords");

  // Select the whole column, then shrink the selection to start at row 2.
  wordSheet.getRange('A:A').activate();
  const wholeColumn = wordSheet.getActiveRange();
  const rowsBelowFirst = wordSheet.getActiveRange().getNumRows() - 1;
  wholeColumn.offset(1, 0, rowsBelowFirst).activate();

  // De-duplicate the current selection and keep the result selected.
  const selection = wordSheet.getActiveRange();
  selection.removeDuplicates().activate();

  // Park the cursor outside the data column.
  wordSheet.getRange('C2').activate();
}
The writeWordsToSheet function uses the findLastRow function:
/**
 * Finds the last row of a column that holds a non-empty value.
 *
 * @param {string} sheetName Name of the sheet to inspect.
 * @param {string} columnName Column letter in A1 notation, e.g. "A".
 * @returns {number} 1-based index of the last non-empty row in the column,
 *     or 0 when the sheet does not exist, the sheet has no content, or the
 *     column is empty.
 */
function findLastRow(sheetName, columnName) {
  const sheet = SpreadsheetApp.getActiveSpreadsheet().getSheetByName(sheetName);
  if (!sheet) {
    return 0;
  }

  const lastRow = sheet.getLastRow();
  if (lastRow < 1) {
    // An empty sheet reports 0, and a range ending at row 0 (e.g. "A1:A0")
    // is not valid, so answer before building it.
    return 0;
  }

  const columnValues = sheet.getRange(columnName + "1:" + columnName + lastRow).getValues();
  // Other columns may be longer than this one, so walk up from the sheet's
  // last row until this column has a value.
  for (let row = lastRow; row > 0; row--) {
    if (columnValues[row - 1][0] !== "") {
      return row;
    }
  }

  return 0;
}