Web scraping
Apparence
Html Agility Pack
using HtmlAgilityPack; // NuGet package: HtmlAgilityPack

// Downloads a Google results page for `query` and runs XPath queries against the
// returned HTML. `query` is expected to be defined by the surrounding code.

// In a real application, reuse one HttpClient instance instead of creating one per request.
HttpClient httpClient = new HttpClient();

// Percent-escape the query so reserved characters (&, #, +, %, ...) cannot corrupt
// the URL; spaces are then written as '+', as in the original form-style encoding.
var escapedQuery = System.Uri.EscapeDataString(query).Replace("%20", "+");
var url = $"https://www.google.com/search?q={escapedQuery}";
var response = await httpClient.GetAsync(url);
var htmlContent = await response.Content.ReadAsStringAsync();

// Parse the HTML
var pageDocument = new HtmlDocument();
pageDocument.LoadHtml(htmlContent);

// XPath queries
// SelectSingleNode returns the first match, or null when nothing matches.
var node = pageDocument.DocumentNode.SelectSingleNode("//div[contains(@id, 'my-id')]");
// SelectNodes returns null (not an empty collection) when nothing matches.
var nodes = pageDocument.DocumentNode.SelectNodes("//div[contains(@id, 'myId')]//div[contains(@class, 'myClass')][1]//li");
API:Search Wikipedia
// Looks up `query` with the MediaWiki search API, then scrapes the first paragraph of
// the best-matching article into `text` (at most ~500 characters, ending with a period).
// `text` stays null when no article or paragraph could be found.
// `query` and `httpClient` are expected to be defined by the surrounding code.

// Percent-escape the search term so reserved characters cannot break the query string;
// spaces are written as '+', as before.
var escapedQuery = System.Uri.EscapeDataString(query).Replace("%20", "+");
var url = $"https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={escapedQuery}&srlimit=1&srnamespace=0&format=json";
var response = await httpClient.GetAsync(url);
var jsonResult = await response.Content.ReadAsAsync<JObject>();

// A search without any hit has no "search[0]" entry, so the token may be missing.
var title = jsonResult?.SelectToken("query.search[0].title")?.Value<string>();

string text = null;
if (!string.IsNullOrEmpty(title))
{
    // Article URLs use '_' for spaces; escape the rest so '?', '#', ... stay part of the title.
    url = $"https://en.wikipedia.org/wiki/{System.Uri.EscapeDataString(title.Replace(" ", "_"))}";
    response = await httpClient.GetAsync(url);
    var htmlContent = await response.Content.ReadAsStringAsync();

    var pageDocument = new HtmlDocument();
    pageDocument.LoadHtml(htmlContent);

    // SelectNodes returns null when nothing matches; skip paragraphs that start with a newline.
    var nodes = pageDocument.DocumentNode.SelectNodes("//div[contains(@id, 'mw-content-text')]//p");
    text = nodes?
        .FirstOrDefault(n => !n.InnerText.StartsWith("\n", System.StringComparison.Ordinal))?
        .InnerText;
}

if (text != null)
{
    // InnerText keeps HTML entities encoded; decode them so the result is plain text and
    // any entity-encoded brackets become literal before the reference regex runs.
    text = HtmlEntity.DeEntitize(text);

    // Shorten to at most 500 characters by dropping trailing sentences.
    while (text.Length > 500)
    {
        var lastPeriod = text.LastIndexOf('.');
        if (lastPeriod < 0)
        {
            // No sentence boundary left to cut at: fall back to a hard cut.
            text = text.Substring(0, 500);
            break;
        }

        text = text.Remove(lastPeriod);
    }

    // Remove the reference markers such as [1] (brackets escaped: unescaped they form a
    // character class that would strip every digit instead).
    text = Regex.Replace(text, @"\[\d{1,3}\]", "");

    text = text.Trim();
    if (!text.EndsWith(".", System.StringComparison.Ordinal))
    {
        text += ".";
    }

    // Disambiguation pages are not a useful answer.
    if (text.Contains("may refer to:"))
    {
        text = "Your search is a bit ambiguous, could you specify your search terms?";
    }
}