Creating a New Connector

This guide will walk you through creating a new connector for the unified property connector system. Connectors are responsible for scraping property data from real estate websites and submitting it to the Mill API.

Before you begin, ensure you have:

  • Go 1.24.9+ installed
  • Understanding of the common.Connector interface
  • Familiarity with web scraping (we use Colly)
  • Access to the target website’s structure

Create a new directory for your connector in connectors/sources/:

Terminal window
cd connectors/sources
mkdir my-new-connector
cd my-new-connector

Naming Convention:

  • Use lowercase with hyphens (e.g., realestate-com-au)
  • Match the domain name when possible
  • Keep names descriptive and unique

Create connector.go in your new directory. All connectors must implement the common.Connector interface:

package my_new_connector
import (
	"fmt"
	"log"
	"net/url"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/gocolly/colly/v2"

	"connectors/common"
)
// Connector implements the common.Connector interface
type Connector struct {
	collector *colly.Collector      // underlying colly scraper instance
	source    string                // source website domain, e.g. "example.com"
	rateLimit time.Duration         // delay between page visits
	stats     common.ConnectorStats // collection counters; guarded by mu
	mu        sync.RWMutex          // protects rateLimit and stats
}
// NewConnector creates a new connector instance with a browser-like
// user agent and a conservative default rate limit (1 request / 2s).
func NewConnector() common.Connector {
	c := colly.NewCollector(
		colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"),
	)
	// Set up rate limiting. Limit only fails on a malformed rule, but
	// silently dropping that error hides misconfiguration — log it.
	if err := c.Limit(&colly.LimitRule{
		DomainGlob:  "*example.com*",
		Parallelism: 1,
		Delay:       2 * time.Second,
	}); err != nil {
		log.Printf("rate limit rule rejected: %v", err)
	}
	// Add headers to appear browser-like
	c.OnRequest(func(r *colly.Request) {
		r.Headers.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
		r.Headers.Set("Accept-Language", "en-US,en;q=0.5")
	})
	return &Connector{
		collector: c,
		source:    "example.com",
		rateLimit: 2 * time.Second,
		stats:     common.ConnectorStats{},
	}
}
// GetName returns the connector name (used in CLI).
// Must match the directory name so -connector flags resolve correctly.
func (h *Connector) GetName() string {
	return "my-new-connector"
}
// GetSource returns the source website domain (e.g. "example.com"),
// as stored on the connector at construction time.
func (h *Connector) GetSource() string {
	return h.source
}
// SetRateLimit configures rate limiting: it records the delay used by
// the page loop and re-applies the colly limit rule with the new delay.
func (h *Connector) SetRateLimit(delay time.Duration) {
	h.mu.Lock()
	defer h.mu.Unlock()
	h.rateLimit = delay
	// Limit only fails on a malformed rule; surface that instead of
	// discarding the error as the original code did.
	if err := h.collector.Limit(&colly.LimitRule{
		DomainGlob:  "*example.com*",
		Parallelism: 1,
		Delay:       delay,
	}); err != nil {
		log.Printf("rate limit rule rejected: %v", err)
	}
}
// GetStats returns current collection statistics.
// The value is copied out under the read lock, so callers receive a
// consistent snapshot that is safe to use without further locking.
func (h *Connector) GetStats() common.ConnectorStats {
	h.mu.RLock()
	defer h.mu.RUnlock()
	return h.stats
}
// HealthCheck verifies the connector can access the target website.
// It issues a single GET against the site root and wraps any failure
// with the URL that was attempted.
func (h *Connector) HealthCheck() error {
	const testURL = "https://www.example.com"
	if err := h.collector.Visit(testURL); err != nil {
		return fmt.Errorf("failed to access %s: %w", testURL, err)
	}
	return nil
}
// ScrapeProperties implements the main scraping logic: it registers
// collector callbacks, walks the paginated search results for
// opts.Location, and returns at most opts.MaxProperties parsed listings.
func (h *Connector) ScrapeProperties(opts common.ConnectorOptions) ([]common.Property, error) {
	var properties []common.Property
	var mu sync.Mutex
	// Reset stats
	h.mu.Lock()
	h.stats = common.ConnectorStats{}
	h.mu.Unlock()
	// Set up collector callbacks.
	// NOTE(review): colly keeps previously registered callbacks, so calling
	// ScrapeProperties twice on the same instance stacks handlers — confirm
	// the CLI only invokes this once per connector instance.
	h.collector.OnHTML(".property-listing", func(e *colly.HTMLElement) {
		property := h.parseProperty(e)
		if property == nil {
			return
		}
		// The listing markup does not carry the search region; it comes
		// from the caller-supplied options.
		property.Region = opts.Location
		mu.Lock()
		properties = append(properties, *property)
		mu.Unlock()
		h.mu.Lock()
		h.stats.PropertiesCollected++
		h.mu.Unlock()
	})
	// Handle errors reported by the collector (HTTP failures etc.).
	h.collector.OnError(func(r *colly.Response, err error) {
		h.mu.Lock()
		h.stats.ErrorsEncountered++
		h.mu.Unlock()
		if opts.Verbose {
			log.Printf("Error scraping %s: %v", r.Request.URL, err)
		}
	})
	// Build search URL based on location
	searchURL := h.buildSearchURL(opts.Location, opts.Pages)
	// Visit search pages until the page budget or property cap is hit.
	for page := 1; page <= opts.Pages && len(properties) < opts.MaxProperties; page++ {
		pageURL := fmt.Sprintf("%s&page=%d", searchURL, page)
		if opts.Verbose {
			log.Printf("Scraping page %d: %s", page, pageURL)
		}
		// Visit returns immediately for malformed or already-visited
		// URLs; the original ignored that error, which hid such failures.
		if err := h.collector.Visit(pageURL); err != nil {
			h.mu.Lock()
			h.stats.ErrorsEncountered++
			h.mu.Unlock()
			if opts.Verbose {
				log.Printf("Error visiting %s: %v", pageURL, err)
			}
		}
		time.Sleep(h.rateLimit) // Respect rate limits
	}
	return properties[:min(len(properties), opts.MaxProperties)], nil
}
// parseProperty extracts property data from an HTML listing element.
// It returns nil for listings missing an address or a positive price so
// the caller can skip them (matching the validation guidance below).
func (h *Connector) parseProperty(e *colly.HTMLElement) *common.Property {
	// Extract property details from HTML
	address := strings.TrimSpace(e.ChildText(".address"))
	if address == "" {
		return nil // Skip listings without an address
	}
	priceText := strings.TrimSpace(e.ChildText(".price"))
	// Parse price (remove currency symbols, commas)
	price := h.parsePrice(priceText)
	if price <= 0 {
		return nil // Skip listings without a valid price
	}
	// Extract other fields
	bedrooms := h.parseInt(e.ChildText(".bedrooms"))
	bathrooms := h.parseFloat(e.ChildText(".bathrooms"))
	// Get property URL
	propertyURL := e.Request.AbsoluteURL(e.ChildAttr("a", "href"))
	// Collect image URLs
	var imageURLs []string
	e.ForEach(".property-image img", func(_ int, img *colly.HTMLElement) {
		imgURL := e.Request.AbsoluteURL(img.Attr("src"))
		imageURLs = append(imageURLs, imgURL)
	})
	return &common.Property{
		SourceID:       h.extractSourceID(propertyURL),
		Source:         h.source,
		Address:        address,
		Price:          price,
		PriceCurrency:  h.detectCurrency(priceText),
		Bedrooms:       bedrooms,
		Bathrooms:      bathrooms,
		PropertyType:   strings.TrimSpace(e.ChildText(".property-type")),
		Description:    strings.TrimSpace(e.ChildText(".description")),
		ImageURLs:      imageURLs,
		SourceURL:      propertyURL,
		CollectionTime: time.Now(),
		// Region is intentionally left zero here: the search location
		// lives in ConnectorOptions, which this method does not receive.
		// (The original code referenced `opts.Location`, which is not in
		// scope in this function and did not compile.)
	}
}
// Helper functions

// buildSearchURL builds the listing-search URL for a location. The
// location is query-escaped so spaces and punctuation survive the URL.
// pages is part of the helper's signature for sites that encode the page
// count in the search URL; this example site paginates via a query
// parameter appended by the caller, so it is unused here.
func (h *Connector) buildSearchURL(location string, pages int) string {
	return fmt.Sprintf("https://www.example.com/search?location=%s", url.QueryEscape(location))
}
// parsePrice converts a display price such as "$1,250,000" to an integer
// in the site's base unit by keeping only the ASCII digits.
// Returns 0 when the text contains no digits or overflows int.
func (h *Connector) parsePrice(priceText string) int {
	var digits strings.Builder
	for _, r := range priceText {
		if r >= '0' && r <= '9' {
			digits.WriteRune(r)
		}
	}
	if digits.Len() == 0 {
		return 0
	}
	n, err := strconv.Atoi(digits.String())
	if err != nil {
		return 0 // e.g. absurdly long digit run overflowing int
	}
	return n
}
// parseInt extracts the first run of digits from text
// (e.g. "3 beds" -> 3). Returns 0 when text contains no digits.
func (h *Connector) parseInt(text string) int {
	start := -1
	for i, r := range text {
		if r >= '0' && r <= '9' {
			if start < 0 {
				start = i
			}
			continue
		}
		if start >= 0 {
			// End of the first digit run; byte slicing is safe because
			// digits are single-byte ASCII.
			n, _ := strconv.Atoi(text[start:i])
			return n
		}
	}
	if start < 0 {
		return 0
	}
	n, _ := strconv.Atoi(text[start:])
	return n
}
// parseFloat extracts the first decimal number from text
// (e.g. "2.5 baths" -> 2.5). The number must start with a digit; a
// trailing bare dot is ignored. Returns 0.0 when no number is present.
func (h *Connector) parseFloat(text string) float64 {
	start, end := -1, len(text)
	for i, r := range text {
		isDigit := r >= '0' && r <= '9'
		if isDigit || (start >= 0 && r == '.') {
			if start < 0 {
				start = i
			}
			continue
		}
		if start >= 0 {
			end = i
			break
		}
	}
	if start < 0 {
		return 0.0
	}
	f, err := strconv.ParseFloat(strings.TrimSuffix(text[start:end], "."), 64)
	if err != nil {
		return 0.0 // e.g. "1.2.3" — malformed numeric token
	}
	return f
}
// extractSourceID derives a unique listing ID from its URL, using the
// last non-empty path segment (e.g. ".../listing/12345?x=1" -> "12345").
// Returns "" when the URL has no usable path segment.
func (h *Connector) extractSourceID(url string) string {
	// Drop query string and fragment.
	if i := strings.IndexAny(url, "?#"); i >= 0 {
		url = url[:i]
	}
	url = strings.TrimRight(url, "/")
	if i := strings.LastIndex(url, "/"); i >= 0 {
		url = url[i+1:]
	}
	// A leftover scheme fragment (e.g. "https:") is not a meaningful ID.
	if url == "" || strings.Contains(url, ":") {
		return ""
	}
	return url
}
// detectCurrency infers the ISO 4217 currency code from a display price
// by looking for common symbols or codes. Defaults to "USD" when nothing
// is recognised, matching the original stub's behavior.
func (h *Connector) detectCurrency(priceText string) string {
	switch {
	case strings.Contains(priceText, "€") || strings.Contains(priceText, "EUR"):
		return "EUR"
	case strings.Contains(priceText, "£") || strings.Contains(priceText, "GBP"):
		return "GBP"
	// Check the prefixed dollar variants before falling through to the
	// plain-$ default.
	case strings.Contains(priceText, "NZ$") || strings.Contains(priceText, "NZD"):
		return "NZD"
	case strings.Contains(priceText, "A$") || strings.Contains(priceText, "AUD"):
		return "AUD"
	default:
		return "USD"
	}
}

Create connector_test.go to test your connector:

package my_new_connector
import (
"testing"
"connectors/common"
)
// TestNewConnector checks that construction succeeds and that the
// connector reports the expected name and source domain.
func TestNewConnector(t *testing.T) {
	h := NewConnector()
	if h == nil {
		t.Fatal("NewConnector returned nil")
	}
	if got := h.GetName(); got != "my-new-connector" {
		t.Errorf("Expected name 'my-new-connector', got '%s'", got)
	}
	if got := h.GetSource(); got != "example.com" {
		t.Errorf("Expected source 'example.com', got '%s'", got)
	}
}
// TestHealthCheck exercises the live health check; a failure is logged
// rather than fatal because the site may be unreachable in CI.
func TestHealthCheck(t *testing.T) {
	h := NewConnector()
	if err := h.HealthCheck(); err != nil {
		t.Logf("Health check failed (may be expected): %v", err)
	}
}
// TestScrapeProperties runs a small dry-run scrape and tolerates an
// empty result, since live site availability varies.
func TestScrapeProperties(t *testing.T) {
	h := NewConnector()
	opts := common.ConnectorOptions{
		Location:      "test-location",
		Pages:         1,
		MaxProperties: 5,
		// Delay is left at its zero value so the connector's default rate
		// limit applies. (The original set `2 * time.Second` here, but
		// this file does not import "time", so it failed to compile.)
		Verbose: true,
		DryRun:  true,
	}
	properties, err := h.ScrapeProperties(opts)
	if err != nil {
		t.Fatalf("ScrapeProperties failed: %v", err)
	}
	if len(properties) == 0 {
		t.Log("No properties found (may be expected)")
	}
}

Create compliance_test.go to ensure your connector implements the interface correctly:

package my_new_connector
import (
"testing"
"connectors/common"
)
// TestConnectorInterface ensures the connector implements all required methods.
// This is a compile-time assertion: assigning a typed nil *Connector to the
// interface fails the build if any method of common.Connector is missing.
func TestConnectorInterface(t *testing.T) {
	var conn common.Connector = (*Connector)(nil)
	_ = conn
}

Add your connector to connectors/main.go:

import (
// ... existing imports
my_new_connector "connectors/sources/my-new-connector"
)
func main() {
	registry := common.NewConnectorRegistry()
	// ... existing registrations
	// Register the new connector so the CLI can select it by name
	// (the name returned by GetName, i.e. "my-new-connector").
	registry.Register(my_new_connector.NewConnector())
	// ... rest of main function
}

Also add a default location in the main() function’s location switch statement:

case "my-new-connector":
*location = "default-location"

Build and test your connector:

Terminal window
# From connectors directory
cd connectors
# Build the binary
go build -o bin/connector .
# Test your connector
./bin/connector -connector my-new-connector -location test -pages 1 -max 5 -dry-run -verbose
# Run health check
./bin/connector -connector my-new-connector -health-check
# Run tests
go test ./sources/my-new-connector/...

Always implement respectful rate limiting:

// Default: 2 seconds between requests
c.Limit(&colly.LimitRule{
DomainGlob: "*example.com*",
Parallelism: 1,
Delay: 2 * time.Second,
})

Handle errors gracefully:

h.collector.OnError(func(r *colly.Response, err error) {
h.mu.Lock()
h.stats.ErrorsEncountered++
h.mu.Unlock()
if opts.Verbose {
log.Printf("Error scraping %s: %v", r.Request.URL, err)
}
})

Validate extracted data:

// Illustrative snippet: shows the validation guards only. The elided
// tail must build and return the *common.Property (and a return is
// required for this to compile).
func (h *Connector) parseProperty(e *colly.HTMLElement) *common.Property {
	address := strings.TrimSpace(e.ChildText(".address"))
	if address == "" {
		return nil // Skip invalid properties
	}
	price := h.parsePrice(e.ChildText(".price"))
	if price <= 0 {
		return nil // Skip properties without valid price
	}
	// ... rest of parsing
}

Use realistic browser headers:

c := colly.NewCollector(
colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"),
)
c.OnRequest(func(r *colly.Request) {
r.Headers.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
r.Headers.Set("Accept-Language", "en-US,en;q=0.5")
r.Headers.Set("Referer", "https://www.example.com")
})

Ensure all required fields are populated:

  • SourceID: Unique identifier from the source
  • Source: Website domain
  • Address: Full property address
  • Price: Price as integer (in smallest currency unit)
  • PriceCurrency: Currency code (USD, EUR, etc.)
  • SourceURL: Direct link to property listing
  • CollectionTime: Timestamp when data was collected

Your connector automatically integrates with Mill API when run with API credentials:

Terminal window
./bin/connector \
-connector my-new-connector \
-location test \
-mill-api "http://localhost:4000/api/v1" \
-mill-api-key "your-token"

The connector will:

  1. Scrape properties
  2. Submit them to Mill API in batches
  3. Handle authentication automatically via token service

Handle paginated results:

for page := 1; page <= opts.Pages && len(properties) < opts.MaxProperties; page++ {
url := fmt.Sprintf("%s?page=%d", baseURL, page)
h.collector.Visit(url)
time.Sleep(h.rateLimit)
}

For JavaScript-rendered content, consider:

  • Using browser automation (Playwright, Selenium)
  • Finding API endpoints the site uses
  • Using headless browsers if necessary

If you encounter anti-bot measures:

  • Increase delays between requests
  • Rotate user agents
  • Use proxy rotation (if allowed)
  • Respect robots.txt
  • Consider official APIs if available

Issue: No properties found

  • Check CSS selectors match the website structure
  • Verify the search URL is correct
  • Enable verbose mode: -verbose
  • Check if the site requires authentication

Issue: Rate limiting / 429 errors

  • Increase delay: -delay 5s
  • Reduce parallelism
  • Check if site has API limits

Issue: Parsing errors

  • Validate HTML structure hasn’t changed
  • Add defensive parsing with fallbacks
  • Log raw HTML for debugging

Issue: Health check fails

  • Verify website is accessible
  • Check network connectivity
  • Ensure URL is correct

After creating your connector:

  1. Test thoroughly with various locations and parameters
  2. Run health checks regularly
  3. Monitor error rates in production
  4. Update selectors if website structure changes
  5. Document any special requirements or limitations

See existing connectors for reference:

  • connectors/sources/harcourts/ - Simple HTML scraping
  • connectors/sources/homes-co-nz/ - Complex pagination
  • connectors/sources/zillow/ - API-based approach